sync from b7516

2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -15,7 +15,6 @@ llama_add_compile_flags()
 if (EMSCRIPTEN)
 else()
    add_subdirectory(batched)
-    add_subdirectory(debug)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)

@@ -35,6 +34,7 @@ else()
    add_subdirectory(gen-docs)
    add_subdirectory(training)
    add_subdirectory(diffusion)
+    add_subdirectory(model-conversion)
    if (NOT GGML_BACKEND_DL)
        add_subdirectory(convert-llama2c-to-ggml)
        # these examples use the backends directly and cannot be built with dynamic loading
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -21,7 +21,7 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

@@ -68,7 +68,7 @@ int main(int argc, char ** argv) {
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;

-    std::vector<llama_sampler_seq_config> sampler_configs;
+    std::vector<llama_sampler *> samplers;

    for (int32_t i = 0; i < n_parallel; ++i) {
        llama_sampler * smpl = llama_sampler_chain_init(sparams);
@@ -78,12 +78,7 @@ int main(int argc, char ** argv) {
        llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
        llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));

-        sampler_configs.push_back({ i, smpl });
-    }
-
-    if (params.sampling.backend_sampling) {
-        ctx_params.samplers   = sampler_configs.data();
-        ctx_params.n_samplers = sampler_configs.size();
+        samplers.push_back(smpl);
    }

    llama_context * ctx = llama_init_from_model(model, ctx_params);
@@ -185,7 +180,7 @@ int main(int argc, char ** argv) {
                continue;
            }

-            const llama_token new_token_id = llama_sampler_sample(sampler_configs[i].sampler, ctx, i_batch[i]);
+            const llama_token new_token_id = llama_sampler_sample(samplers[i], ctx, i_batch[i]);

            // is it an end of generation? -> mark the stream as finished
            if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
@@ -241,15 +236,15 @@ int main(int argc, char ** argv) {
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

    LOG("\n");
-    llama_perf_sampler_print(sampler_configs[0].sampler);
+    llama_perf_sampler_print(samplers[0]);
    llama_perf_context_print(ctx);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

-    for (auto & sampler_config : sampler_configs) {
-        llama_sampler_free(sampler_config.sampler);
+    for (auto & sampler_config : samplers) {
+        llama_sampler_free(sampler_config);
    }

    llama_free(ctx);
--- a/examples/debug/README.md
+++ b/examples/debug/README.md
@@ -1,54 +0,0 @@
-# llama.cpp/examples/debug
-
-This is a utility intended to help debug a model by registering a callback that
-logs GGML operations and tensor data. It can also store the generated logits or
-embeddings as well as the prompt and token ids for comparision with the original
-model.
-
-### Usage
-
-```shell
-llama-debug \
-  --hf-repo ggml-org/models \
-  --hf-file phi-2/ggml-model-q4_0.gguf \
-  --model phi-2-q4_0.gguf \
-  --prompt hello \
-  --save-logits \
-  --verbose
-```
-The tensor data is logged as debug and required the --verbose flag. The reason
-for this is that while useful for a model with many layers there can be a lot of
-output. You can filter the tensor names using the `--tensor-filter` option.
-
-A recommended approach is to first run without `--verbose` and see if the
-generated logits/embeddings are close to the original model. If they are not,
-then it might be required to inspect tensor by tensor and in that case it is
-useful to enable the `--verbose` flag along with `--tensor-filter` to focus on
-specific tensors.
-
-### Options
-This example supports all standard `llama.cpp` options and also accepts the
-following options:
-```console
-$ llama-debug --help
-...
-
----- example-specific params -----
-
--save-logits                           save final logits to files for verification (default: false)
--logits-output-dir PATH                directory for saving logits output files (default: data)
--tensor-filter REGEX                   filter tensor names for debug output (regex pattern, can be specified multiple times)
-```
-
-### Output Files
-
-When `--save-logits` is enabled, the following files are created in the output
-directory:
-
-* `llamacpp-<model>[-embeddings].bin`        - Binary output (logits or embeddings)
-* `llamacpp-<model>[-embeddings].txt`        - Text output (logits or embeddings, one per line)
-* `llamacpp-<model>[-embeddings]-prompt.txt` - Prompt text and token IDs
-* `llamacpp-<model>[-embeddings]-tokens.bin` - Binary token IDs for programmatic comparison
-
-These files can be compared against the original model's output to verify the
-converted model.
--- a/examples/debug/debug.cpp
+++ b/examples/debug/debug.cpp
@@ -1,253 +0,0 @@
-#include "debug.h"
-#include "arg.h"
-#include "common.h"
-#include "log.h"
-#include "llama.h"
-
-#include <cstdlib>
-#include <string>
-#include <vector>
-#include <filesystem>
-#include <fstream>
-#include <regex>
-
-static void print_usage(int /*argc*/, char ** argv) {
-    const std::string usage_template = R"(
-        example usage:
-
-          Print tensors:
-
-          {prog} -m model.gguf -p "Hello my name is" --verbose
-
-          The tensors to be printed can be filtered with --tensor-filter option.
-
-          Save logits/embeddings:
-
-          {prog} -m model.gguf -p "Hello my name is" --save-logits
-
-          Add --embedding to save embeddings)" "\n";
-
-    // Fix the source code indentation above that is introduced by the raw string literal.
-    std::string usage = std::regex_replace(usage_template, std::regex("\\n {8}"), "\n");
-    usage = std::regex_replace(usage, std::regex("\\{prog\\}"), argv[0]);
-    LOG("%s\n", usage.c_str());
-}
-
-static bool has_pooling(llama_context * ctx) {
-    switch (llama_pooling_type(ctx)) {
-        case LLAMA_POOLING_TYPE_NONE:
-        case LLAMA_POOLING_TYPE_UNSPECIFIED:
-            return false;
-        default:
-            return true;
-    }
-}
-
-struct output_data {
-    float *                  data_ptr    = nullptr;
-    int                      data_size   = 0;
-    std::string              type_suffix;
-    std::vector<float>       embd_norm;
-    std::string              prompt;
-    std::vector<llama_token> tokens;
-
-    output_data(llama_context * ctx, const llama_model * model, const common_params & params) {
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-        const bool add_bos = llama_vocab_get_add_bos(vocab);
-
-        tokens = common_tokenize(ctx, params.prompt, add_bos);
-        prompt = params.prompt;
-
-        if (params.embedding) {
-            const int n_embd       = llama_model_n_embd_out(model);
-            const bool pooling     = has_pooling(ctx);
-            const int n_embd_count = pooling ? 1 : tokens.size();
-            const int n_floats     = n_embd * n_embd_count;
-
-            float * embd_raw = pooling ? llama_get_embeddings_seq(ctx, 0) : llama_get_embeddings(ctx);
-            if (embd_raw == nullptr) {
-                throw std::runtime_error("failed to get embeddings from the model");
-            }
-
-            LOG_DBG("pooling_enabled: %s\n", pooling ? "true" : "false");
-            LOG_DBG("n_embd: %d\n", n_embd);
-            LOG_DBG("n_floats: %d\n", n_floats);
-            LOG_DBG("n_embd_count: %d\n", n_embd_count);
-
-            data_ptr    = embd_raw;
-            data_size   = n_floats;
-            type_suffix = "-embeddings";
-
-            if (params.embd_normalize >= 0) {
-                embd_norm.resize(n_floats);
-                for (int i = 0; i < n_embd_count; i++) {
-                    common_embd_normalize(embd_raw+i*n_embd, embd_norm.data()+i*n_embd, n_embd, params.embd_normalize);
-                }
-                data_ptr = embd_norm.data();
-            }
-        } else {
-            const float * logits = llama_get_logits_ith(ctx, tokens.size() - 1);
-            const int n_logits = llama_vocab_n_tokens(vocab);
-
-            data_ptr = const_cast<float*>(logits);
-            data_size = n_logits;
-            type_suffix = "";
-        }
-    }
-};
-
-static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
-    std::filesystem::create_directory(output_dir);
-    auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
-
-    // Save logits/embeddings to binary file.
-    {
-        std::filesystem::path filepath{base_path.string() + ".bin"};
-        std::ofstream file{filepath, std::ios::binary};
-        if (!file) {
-            throw std::runtime_error("failed to open binary output file: " + filepath.string());
-        }
-        file.write(reinterpret_cast<const char*>(output.data_ptr), output.data_size * sizeof(float));
-        LOG("Data saved to %s\n", filepath.c_str());
-    }
-
-    // Save logits/embeddings to text file.
-    {
-        std::filesystem::path filepath{base_path.string() + ".txt"};
-        std::ofstream file{filepath};
-        if (!file) {
-            throw std::runtime_error("failed to open text output file: " + filepath.string());
-        }
-        for (int i = 0; i < output.data_size; i++) {
-            file << i << ": " << output.data_ptr[i] << '\n';
-        }
-        LOG("Data saved to %s\n", filepath.c_str());
-    }
-
-    // Save prompt and tokens to text file.
-    {
-        std::filesystem::path filepath{base_path.string() + "-prompt.txt"};
-        std::ofstream file{filepath};
-        if (!file) {
-            throw std::runtime_error("failed to open prompt output file: " + filepath.string());
-        }
-
-        file << "prompt: " << output.prompt << '\n';
-        file << "n_tokens: " << output.tokens.size() << '\n';
-
-        file << "token ids: ";
-        for (size_t i = 0; i < output.tokens.size(); i++) {
-            file << output.tokens[i];
-            if (i + 1 < output.tokens.size()) {
-                file << ", ";
-            }
-        }
-        file << '\n';
-        LOG("Prompt saved to %s\n", filepath.c_str());
-    }
-
-    // Save token ids to binary file.
-    {
-        std::filesystem::path filepath{base_path.string() + "-tokens.bin"};
-        std::ofstream file{filepath, std::ios::binary};
-        if (!file) {
-            throw std::runtime_error("failed to open tokens binary file: " + filepath.string());
-        }
-        file.write(reinterpret_cast<const char*>(output.tokens.data()), output.tokens.size() * sizeof(llama_token));
-        LOG("Tokens saved to %s\n", filepath.c_str());
-    }
-
-}
-
-static void print_tokenized_prompt(llama_context * ctx, const std::vector<llama_token> & tokens, const std::string & prompt) {
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    LOG("Model add_bos: %s\n", llama_vocab_get_add_bos(vocab) ? "true" : "false");
-    LOG("Input prompt: \"%s\"\n", prompt.c_str());
-    LOG("Token ids (%zu):\n", tokens.size());
-
-    for (auto id : tokens) {
-        std::string piece(128, '\0');
-        int n = llama_token_to_piece(vocab, id, piece.data(), piece.size(), 0, true);
-        if (n < 0) {
-            LOG_ERR("failed to convert token %d to piece\n", id);
-            continue;
-        }
-        piece.resize(n);
-        LOG("%s(%d) ", piece.c_str(), id);
-    }
-    LOG("\n");
-}
-
-static bool run(llama_context * ctx, const common_params & params) {
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    const bool add_bos = llama_vocab_get_add_bos(vocab);
-
-    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
-
-    if (tokens.empty()) {
-        LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
-        return false;
-    }
-
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
-        LOG_ERR("%s : failed to eval\n", __func__);
-        return false;
-    }
-
-    print_tokenized_prompt(ctx, tokens, params.prompt);
-
-    if (params.save_logits) {
-        output_data output {ctx, model, params};
-        std::filesystem::path model_path{params.model.path};
-        std::string model_name{model_path.stem().string()};
-        save_output_data(output, model_name, params.logits_output_dir);
-    }
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
-        return 1;
-    }
-
-    common_init();
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    base_callback_data cb_data(params, params.tensor_filter);
-
-    auto llama_init = common_init_from_params(params);
-
-    auto * model = llama_init->model();
-    auto * ctx   = llama_init->context();
-
-    if (model == nullptr || ctx == nullptr) {
-        LOG_ERR("%s : failed to init\n", __func__);
-        return 1;
-    }
-
-    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
-        LOG_INF("\n");
-    }
-
-    if (!run(ctx, params)) {
-        return 1;
-    }
-
-    LOG("\n");
-    llama_perf_context_print(ctx);
-
-    llama_backend_free();
-
-    return 0;
-}
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -553,7 +553,6 @@ int main(int argc, char ** argv) {
    model_params.n_gpu_layers       = params.n_gpu_layers;
    model_params.devices            = params.devices.data();
    model_params.use_mmap           = params.use_mmap;
-    model_params.use_direct_io      = params.use_direct_io;
    model_params.use_mlock          = params.use_mlock;
    model_params.check_tensors      = params.check_tensors;

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -33,7 +33,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
    }
 }

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd_out, int embd_norm) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

    // clear previous kv_cache values (irrelevant for embeddings)
@@ -65,8 +65,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
        }

-        float * out = output + embd_pos * n_embd_out;
-        common_embd_normalize(embd, out, n_embd_out, embd_norm);
+        float * out = output + embd_pos * n_embd;
+        common_embd_normalize(embd, out, n_embd, embd_norm);
    }
 }

@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
    }

    // allocate output
-    const int n_embd_out = llama_model_n_embd_out(model);
-    std::vector<float> embeddings(n_embd_count * n_embd_out, 0);
+    const int n_embd = llama_model_n_embd(model);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
    float * emb = embeddings.data();

    // break into batches
@@ -267,8 +267,8 @@ int main(int argc, char ** argv) {

        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
-            float * out = emb + e * n_embd_out;
-            batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
+            float * out = emb + e * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
            s = 0;
            common_batch_clear(batch);
@@ -280,8 +280,8 @@ int main(int argc, char ** argv) {
    }

    // final batch
-    float * out = emb + e * n_embd_out;
-    batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
+    float * out = emb + e * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

    if (params.embd_out.empty()) {
        LOG("\n");
@@ -289,19 +289,19 @@ int main(int argc, char ** argv) {
        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
            for (int j = 0; j < n_embd_count; j++) {
                LOG("embedding %d: ", j);
-                for (int i = 0; i < std::min(3, n_embd_out); i++) {
+                for (int i = 0; i < std::min(3, n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd_out + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd_out + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                    }
                }
                LOG(" ... ");
-                for (int i = n_embd_out - 3; i < n_embd_out; i++) {
+                for (int i = n_embd - 3; i < n_embd; i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd_out + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd_out + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                    }
                }
                LOG("\n");
@@ -320,9 +320,9 @@ int main(int argc, char ** argv) {
                for (uint32_t i = 0; i < n_cls_out; i++) {
                    // NOTE: if you change this log - update the tests in ci/run.sh
                    if (n_cls_out == 1) {
-                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd_out]);
+                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
                    } else {
-                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd_out + i], cls_out_labels[i].c_str());
+                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
                    }
                }
            }
@@ -330,11 +330,11 @@ int main(int argc, char ** argv) {
            // print the first part of the embeddings or for a single prompt, the full embedding
            for (int j = 0; j < n_prompts; j++) {
                LOG("embedding %d: ", j);
-                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd_out) : n_embd_out); i++) {
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd_out + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd_out + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                    }
                }
                LOG("\n");
@@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
                LOG("\n");
                for (int i = 0; i < n_prompts; i++) {
                    for (int j = 0; j < n_prompts; j++) {
-                        float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
+                        float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                        LOG("%6.2f ", sim);
                    }
                    LOG("%1.10s", prompts[i].c_str());
@@ -368,9 +368,9 @@ int main(int argc, char ** argv) {
            if (notArray) LOG("    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
            LOG("[");
            for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd_out + i]);
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                i++;
-                if (i < n_embd_out) LOG(","); else break;
+                if (i < n_embd) LOG(","); else break;
            }
            LOG(notArray ? "]\n    }" : "]");
            j++;
@@ -383,7 +383,7 @@ int main(int argc, char ** argv) {
            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
                LOG("    [");
                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-                    float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
+                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                    LOG("%6.2f", sim);
                    j++;
                    if (j < n_embd_count) LOG(", "); else break;
@@ -397,7 +397,7 @@ int main(int argc, char ** argv) {

        if (notArray) LOG("\n}\n");
    } else if (params.embd_out == "raw") {
-        print_raw_embeddings(emb, n_embd_count, n_embd_out, model, pooling_type, params.embd_normalize);
+        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
    }

    LOG("\n");
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -4,23 +4,12 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

-if(LLAMA_BUILD_TESTS)
-    if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
-        set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf")
-        set(MODEL_HASH "SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739")
-    else()
-        set(MODEL_NAME "tinyllamas/stories15M-be.Q4_0.gguf")
-        set(MODEL_HASH "SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d")
-    endif()
-    set(MODEL_DEST "${CMAKE_BINARY_DIR}/${MODEL_NAME}")
-    set(TEST_TARGET test-eval-callback)
-    add_test(NAME ${TEST_TARGET}-download-model COMMAND ${CMAKE_COMMAND}
-        -DDEST=${MODEL_DEST}
-        -DNAME=${MODEL_NAME}
-        -DHASH=${MODEL_HASH}
-        -P ${CMAKE_SOURCE_DIR}/cmake/download-models.cmake
-    )
-    set_tests_properties(${TEST_TARGET}-download-model PROPERTIES FIXTURES_SETUP ${TEST_TARGET}-download-model)
-    add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback -m "${MODEL_DEST}" --prompt hello --seed 42 -ngl 0)
-    set_tests_properties(${TEST_TARGET} PROPERTIES FIXTURES_REQUIRED ${TEST_TARGET}-download-model)
+set(TEST_TARGET test-eval-callback)
+if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+        add_test(NAME ${TEST_TARGET}
+                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+else()
+        add_test(NAME ${TEST_TARGET}
+                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
 endif()
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,12 +1,165 @@
 #include "arg.h"
 #include "common.h"
-#include "debug.h"
 #include "log.h"
 #include "llama.h"
-#include "llama-cpp.h"
+#include "ggml.h"
+
+#include <cmath>
+#include <cstdio>
 #include <string>
 #include <vector>

+/**
+ * This is the arbitrary data which will be passed to each callback.
+ * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+    std::vector<uint8_t> data; // staging buffer: ggml_debug() copies a tensor's bytes here when its buffer is not host memory
+};
+
+// Render every entry of t->ne (GGML_MAX_DIMS of them) as a ", "-separated list.
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string dims;
+    for (int axis = 0; axis < GGML_MAX_DIMS; ++axis) {
+        // a separator goes in front of every entry except the first
+        dims += (axis == 0) ? "" : ", ";
+        dims += std::to_string(t->ne[axis]);
+    }
+    return dims;
+}
+
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { // widen a bf16 bit pattern to a float
+    union {
+        uint32_t raw;   // 32-bit pattern being assembled
+        float    value; // the same storage read back as a float
+    } pun;
+    pun.raw = (uint32_t)h.bits << 16; // h.bits is shifted into the upper half; the low 16 bits are zero
+    return pun.value;
+}
+
+// Fetch element (i0, i1, i2, i3) of a tensor buffer and return it as a float; nb[] holds the byte stride of each dim.
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+    const size_t offset = i0 * nb[0] + i1 * nb[1] + i2 * nb[2] + i3 * nb[3];
+    switch (type) {
+        case GGML_TYPE_F16:
+            return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[offset]);
+        case GGML_TYPE_F32:
+            return *(const float *) &data[offset];
+        case GGML_TYPE_I64:
+            return (float) *(const int64_t *) &data[offset];
+        case GGML_TYPE_I32:
+            return (float) *(const int32_t *) &data[offset];
+        case GGML_TYPE_I16:
+            return (float) *(const int16_t *) &data[offset];
+        case GGML_TYPE_I8:
+            return (float) *(const int8_t *) &data[offset];
+        case GGML_TYPE_BF16:
+            return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[offset]);
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { // log a tensor's values (abbreviated) and their sum
+    GGML_ASSERT(n > 0); // n: how many leading and trailing entries are printed in each abbreviated dimension
+    float sum = 0; // first pass: accumulate every element, including the ones elided from the printout
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    sum += v;
+                }
+            }
+        }
+    }
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) { // second pass: print; dimension 3 is never abbreviated
+        LOG("                                     [\n");
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) { // more than 2*n entries: print "..." and jump to the last n
+                LOG("                                      ..., \n");
+                i2 = ne[2] - n;
+            }
+            LOG("                                      [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) { // same first-n / last-n abbreviation for dimension 1
+                    LOG("                                       ..., \n");
+                    i1 = ne[1] - n;
+                }
+                LOG("                                       [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) { // same first-n / last-n abbreviation for dimension 0
+                        LOG("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    LOG("%12.4f", v);
+                    if (i0 < ne[0] - 1) LOG(", "); // no separator after the last value of a row
+                }
+                LOG("],\n");
+            }
+            LOG("                                      ],\n");
+        }
+        LOG("                                     ]\n");
+        LOG("                                     sum = %f\n", sum); // the whole-tensor sum, repeated after each i3 slice
+    }
+
+    // TODO: make this abort configurable/optional?
+    if (std::isnan(sum)) {
+        LOG_ERR("encountered NaN - aborting\n");
+        exit(0); // NOTE(review): terminates the process with status 0 even though an error was just logged
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ * Logs the tensor's name, type, op and shapes, then prints its values unless the type is quantized.
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data pointer to the callback_data whose buffer is used to stage tensors that are not in host memory
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    char src1_str[128] = {0};
+    if (src1) {
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+    }
+    // src0 is guarded like src1: a tensor without a first source logs "" instead of dereferencing NULL
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+         t->name, ggml_type_name(t->type), ggml_op_desc(t),
+         src0 ? src0->name : "", src0 ? ggml_ne_string(src0).c_str() : "",
+         src1 ? src1_str : "",
+         ggml_ne_string(t).c_str());
+
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type)) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
 static bool run(llama_context * ctx, const common_params & params) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -29,7 +182,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 }

 int main(int argc, char ** argv) {
-    base_callback_data cb_data;
+    callback_data cb_data;

    common_params params;

@@ -44,7 +197,7 @@ int main(int argc, char ** argv) {

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
-    params.cb_eval = common_debug_cb_eval<false>;
+    params.cb_eval = ggml_debug;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -41,8 +41,11 @@ android {
        }
    }
    compileOptions {
-        sourceCompatibility = JavaVersion.VERSION_17
-        targetCompatibility = JavaVersion.VERSION_17
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
    }
 }

--- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
@@ -6,7 +6,6 @@ import android.util.Log
 import android.widget.EditText
 import android.widget.TextView
 import android.widget.Toast
-import androidx.activity.addCallback
 import androidx.activity.enableEdgeToEdge
 import androidx.activity.result.contract.ActivityResultContracts
 import androidx.appcompat.app.AppCompatActivity
@@ -19,7 +18,6 @@ import com.arm.aichat.gguf.GgufMetadata
 import com.arm.aichat.gguf.GgufMetadataReader
 import com.google.android.material.floatingactionbutton.FloatingActionButton
 import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.Job
 import kotlinx.coroutines.flow.onCompletion
 import kotlinx.coroutines.launch
 import kotlinx.coroutines.withContext
@@ -38,7 +36,6 @@ class MainActivity : AppCompatActivity() {

    // Arm AI Chat inference engine
    private lateinit var engine: InferenceEngine
-    private var generationJob: Job? = null

    // Conversation states
    private var isModelReady = false
@@ -50,13 +47,11 @@ class MainActivity : AppCompatActivity() {
        super.onCreate(savedInstanceState)
        enableEdgeToEdge()
        setContentView(R.layout.activity_main)
-        // View model boilerplate and state management is out of this basic sample's scope
-        onBackPressedDispatcher.addCallback { Log.w(TAG, "Ignore back press for simplicity") }

        // Find views
        ggufTv = findViewById(R.id.gguf)
        messagesRv = findViewById(R.id.messages)
-        messagesRv.layoutManager = LinearLayoutManager(this).apply { stackFromEnd = true }
+        messagesRv.layoutManager = LinearLayoutManager(this)
        messagesRv.adapter = messageAdapter
        userInputEt = findViewById(R.id.user_input)
        userActionFab = findViewById(R.id.fab)
@@ -162,35 +157,33 @@ class MainActivity : AppCompatActivity() {
     * Validate and send the user message into [InferenceEngine]
     */
    private fun handleUserInput() {
-        userInputEt.text.toString().also { userMsg ->
-            if (userMsg.isEmpty()) {
+        userInputEt.text.toString().also { userSsg ->
+            if (userSsg.isEmpty()) {
                Toast.makeText(this, "Input message is empty!", Toast.LENGTH_SHORT).show()
            } else {
                userInputEt.text = null
-                userInputEt.isEnabled = false
                userActionFab.isEnabled = false

                // Update message states
-                messages.add(Message(UUID.randomUUID().toString(), userMsg, true))
+                messages.add(Message(UUID.randomUUID().toString(), userSsg, true))
                lastAssistantMsg.clear()
                messages.add(Message(UUID.randomUUID().toString(), lastAssistantMsg.toString(), false))

-                generationJob = lifecycleScope.launch(Dispatchers.Default) {
-                    engine.sendUserPrompt(userMsg)
+                lifecycleScope.launch(Dispatchers.Default) {
+                    engine.sendUserPrompt(userSsg)
                        .onCompletion {
                            withContext(Dispatchers.Main) {
-                                userInputEt.isEnabled = true
                                userActionFab.isEnabled = true
                            }
                        }.collect { token ->
+                            val messageCount = messages.size
+                            check(messageCount > 0 && !messages[messageCount - 1].isUser)
+
+                            messages.removeAt(messageCount - 1).copy(
+                                content = lastAssistantMsg.append(token).toString()
+                            ).let { messages.add(it) }
+
                            withContext(Dispatchers.Main) {
-                                val messageCount = messages.size
-                                check(messageCount > 0 && !messages[messageCount - 1].isUser)
-
-                                messages.removeAt(messageCount - 1).copy(
-                                    content = lastAssistantMsg.append(token).toString()
-                                ).let { messages.add(it) }
-
                                messageAdapter.notifyItemChanged(messages.size - 1)
                            }
                        }
@@ -202,7 +195,6 @@ class MainActivity : AppCompatActivity() {
    /**
     * Run a benchmark with the model file
     */
-    @Deprecated("This benchmark doesn't accurately indicate GUI performance expected by app developers")
    private suspend fun runBenchmark(modelName: String, modelFile: File) =
        withContext(Dispatchers.Default) {
            Log.i(TAG, "Starts benchmarking $modelName")
@@ -231,16 +223,6 @@ class MainActivity : AppCompatActivity() {
            if (!it.exists()) { it.mkdir() }
        }

-    override fun onStop() {
-        generationJob?.cancel()
-        super.onStop()
-    }
-
-    override fun onDestroy() {
-        engine.destroy()
-        super.onDestroy()
-    }
-
    companion object {
        private val TAG = MainActivity::class.java.simpleName

--- a/examples/llama.android/app/src/main/res/layout/activity_main.xml
+++ b/examples/llama.android/app/src/main/res/layout/activity_main.xml
@@ -24,7 +24,7 @@
                android:id="@+id/gguf"
                android:layout_width="match_parent"
                android:layout_height="wrap_content"
-                android:padding="16dp"
+                android:layout_margin="16dp"
                android:text="Selected GGUF model's metadata will show here."
                style="@style/TextAppearance.MaterialComponents.Body2" />

@@ -33,7 +33,8 @@
        <com.google.android.material.divider.MaterialDivider
            android:layout_width="match_parent"
            android:layout_height="2dp"
-            android:layout_marginHorizontal="16dp" />
+            android:layout_marginHorizontal="16dp"
+            android:layout_marginVertical="8dp" />

        <androidx.recyclerview.widget.RecyclerView
            android:id="@+id/messages"
--- a/examples/llama.android/gradle/libs.versions.toml
+++ b/examples/llama.android/gradle/libs.versions.toml
@@ -1,15 +1,15 @@
 [versions]

 # Plugins
-agp = "8.13.2"
-kotlin = "2.3.0"
+agp = "8.13.0"
+kotlin = "2.2.20"

 # AndroidX
-activity = "1.12.2"
+activity = "1.11.0"
 appcompat = "1.7.1"
 core-ktx = "1.17.0"
 constraint-layout = "2.2.1"
-datastore-preferences = "1.2.0"
+datastore-preferences = "1.1.7"

 # Material
 material = "1.13.0"
--- a/examples/llama.android/lib/build.gradle.kts
+++ b/examples/llama.android/lib/build.gradle.kts
@@ -26,7 +26,7 @@ android {

                arguments += "-DBUILD_SHARED_LIBS=ON"
                arguments += "-DLLAMA_BUILD_COMMON=ON"
-                arguments += "-DLLAMA_OPENSSL=OFF"
+                arguments += "-DLLAMA_CURL=OFF"

                arguments += "-DGGML_NATIVE=OFF"
                arguments += "-DGGML_BACKEND_DL=ON"
--- a/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
+++ b/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
@@ -560,6 +560,7 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_unload(JNIEnv * /*unused*/, job

+// JNI entry point for InferenceEngineImpl.shutdown(): calls llama_backend_free().
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_arm_aichat_internal_InferenceEngineImpl_shutdown(JNIEnv *, jobject /*unused*/) {
+Java_com_arm_aichat_internal_InferenceEngineImpl_shutdown(JNIEnv * /*unused*/, jobject /*unused*/) {
    llama_backend_free();
 }
--- a/examples/llama.android/lib/src/main/java/com/arm/aichat/InferenceEngine.kt
+++ b/examples/llama.android/lib/src/main/java/com/arm/aichat/InferenceEngine.kt
@@ -38,7 +38,7 @@ interface InferenceEngine {
    /**
     * Unloads the currently loaded model.
     */
-    fun cleanUp()
+    suspend fun cleanUp()

    /**
     * Cleans up resources when the engine is no longer needed.
--- a/examples/llama.android/lib/src/main/java/com/arm/aichat/internal/InferenceEngineImpl.kt
+++ b/examples/llama.android/lib/src/main/java/com/arm/aichat/internal/InferenceEngineImpl.kt
@@ -15,11 +15,9 @@ import kotlinx.coroutines.cancel
 import kotlinx.coroutines.flow.Flow
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.flow.StateFlow
-import kotlinx.coroutines.flow.asStateFlow
 import kotlinx.coroutines.flow.flow
 import kotlinx.coroutines.flow.flowOn
 import kotlinx.coroutines.launch
-import kotlinx.coroutines.runBlocking
 import kotlinx.coroutines.withContext
 import java.io.File
 import java.io.IOException
@@ -111,11 +109,9 @@ internal class InferenceEngineImpl private constructor(

    private val _state =
        MutableStateFlow<InferenceEngine.State>(InferenceEngine.State.Uninitialized)
-    override val state: StateFlow<InferenceEngine.State> = _state.asStateFlow()
+    override val state: StateFlow<InferenceEngine.State> = _state

    private var _readyForSystemPrompt = false
-    @Volatile
-    private var _cancelGeneration = false

    /**
     * Single-threaded coroutine dispatcher & scope for LLama asynchronous operations
@@ -173,8 +169,6 @@ internal class InferenceEngineImpl private constructor(
                }
                Log.i(TAG, "Model loaded!")
                _readyForSystemPrompt = true
-
-                _cancelGeneration = false
                _state.value = InferenceEngine.State.ModelReady
            } catch (e: Exception) {
                Log.e(TAG, (e.message ?: "Error loading model") + "\n" + pathToModel, e)
@@ -237,19 +231,15 @@ internal class InferenceEngineImpl private constructor(

            Log.i(TAG, "User prompt processed. Generating assistant prompt...")
            _state.value = InferenceEngine.State.Generating
-            while (!_cancelGeneration) {
+            while (true) {
                generateNextToken()?.let { utf8token ->
                    if (utf8token.isNotEmpty()) emit(utf8token)
                } ?: break
            }
-            if (_cancelGeneration) {
-                Log.i(TAG, "Assistant generation aborted per requested.")
-            } else {
-                Log.i(TAG, "Assistant generation complete. Awaiting user prompt...")
-            }
+            Log.i(TAG, "Assistant generation complete. Awaiting user prompt...")
            _state.value = InferenceEngine.State.ModelReady
        } catch (e: CancellationException) {
-            Log.i(TAG, "Assistant generation's flow collection cancelled.")
+            Log.i(TAG, "Generation cancelled by user.")
            _state.value = InferenceEngine.State.ModelReady
            throw e
        } catch (e: Exception) {
@@ -278,9 +268,8 @@ internal class InferenceEngineImpl private constructor(
    /**
     * Unloads the model and frees resources, or reset error states
     */
-    override fun cleanUp() {
-        _cancelGeneration = true
-        runBlocking(llamaDispatcher) {
+    override suspend fun cleanUp() =
+        withContext(llamaDispatcher) {
            when (val state = _state.value) {
                is InferenceEngine.State.ModelReady -> {
                    Log.i(TAG, "Unloading model and free resources...")
@@ -304,21 +293,17 @@ internal class InferenceEngineImpl private constructor(
                else -> throw IllegalStateException("Cannot unload model in ${state.javaClass.simpleName}")
            }
        }
-    }

    /**
     * Cancel all ongoing coroutines and free GGML backends
     */
    override fun destroy() {
-        _cancelGeneration = true
-        runBlocking(llamaDispatcher) {
-            _readyForSystemPrompt = false
-            when(_state.value) {
-                is InferenceEngine.State.Uninitialized -> {}
-                is InferenceEngine.State.Initialized -> shutdown()
-                else -> { unload(); shutdown() }
-            }
-        }
+        _readyForSystemPrompt = false
        llamaScope.cancel()
+        when(_state.value) {
+            is InferenceEngine.State.Uninitialized -> {}
+            is InferenceEngine.State.Initialized -> shutdown()
+            else -> { unload(); shutdown() }
+        }
    }
 }
--- a/examples/model-conversion/CMakeLists.txt
+++ b/examples/model-conversion/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET llama-debug)
-add_executable(${TARGET} debug.cpp)
+set(TARGET llama-logits)
+add_executable(${TARGET} logits.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@@ -25,8 +25,6 @@ define quantize_model
 	@echo "Export the quantized model path to $(2) variable in your environment"
 endef

-DEVICE ?= auto
-
 ###
 ### Casual Model targets/recipes
 ###
@@ -55,13 +53,13 @@ causal-convert-mm-model:

 causal-run-original-model:
 	$(call validate_model_path,causal-run-original-model)
-	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py --device "$(DEVICE)"
+	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py

 causal-run-converted-model:
 	@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh

 causal-verify-logits: causal-run-original-model causal-run-converted-model
-	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
+	@./scripts/causal/compare-logits.py
 	@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}

 causal-run-original-embeddings:
@@ -138,13 +136,16 @@ embedding-run-original-model-st: embedding-run-original-model
 embedding-run-converted-model:
 	@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
 	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
-	$(if $(EMBD_NORMALIZE),--embd-normalize "$(EMBD_NORMALIZE)")
+	$(if $(USE_POOLING),--pooling)
+
+embedding-run-converted-model-st: USE_POOLING=1
+embedding-run-converted-model-st: embedding-run-converted-model

 embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
 	@./scripts/embedding/compare-embeddings-logits.sh \
 	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

-embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model
+embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
 	@./scripts/embedding/compare-embeddings-logits.sh \
 	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@@ -198,13 +198,14 @@ model, and the other is a text file which allows for manual visual inspection.

 #### Using SentenceTransformer with numbered layers
 For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
-03_Dense, 04_Normalize), these will be applied automatically when running the
-converted model but currently there is a separate target to run the original
-version:
+03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:

 ```console
 # Run original model with SentenceTransformer (applies all numbered layers)
 (venv) $ make embedding-run-original-model-st
+
+# Run converted model with pooling enabled
+(venv) $ make embedding-run-converted-model-st
 ```

 This will use the SentenceTransformer library to load and run the model, which
@@ -212,17 +213,6 @@ automatically applies all the numbered layers in the correct order. This is
 particularly useful when comparing with models that should include these
 additional transformation layers beyond just the base model output.

-The type of normalization can be specified for the converted model but is not
-strictly necessary as the verification uses cosine similarity and the magnitude
-of the output vectors does not affect this. But the normalization type can be
-specified as an argument to the target which might be useful for manual
-inspection:
-```console
-(venv) $ make embedding-verify-logits-st EMBD_NORMALIZE=1
-```
-The original model will apply the normalization according to the normalization
-layer specified in the modules.json configuration file.
-
 ### Model conversion
 After updates have been made to [gguf-py](../../gguf-py) to add support for the
 new model the model can be converted to GGUF format using the following command:
--- a/examples/model-conversion/logits.cpp
+++ b/examples/model-conversion/logits.cpp
@@ -0,0 +1,295 @@
+#include "llama.h"
+#include "common.h"
+
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <vector>
+#include <ctype.h>
+#include <filesystem>
+
+// RAII owners so that every return path of main() releases what it acquired.
+using model_ptr   = std::unique_ptr<llama_model,   decltype(&llama_model_free)>;
+using context_ptr = std::unique_ptr<llama_context, decltype(&llama_free)>;
+using file_ptr    = std::unique_ptr<FILE, int (*)(FILE *)>;
+
+// Prints the command line synopsis and the accepted -embd-norm values.
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [-pooling] [-embd-norm <norm>] [prompt]\n", argv[0]);
+    printf("\n");
+    printf("  -embd-norm: normalization type for pooled embeddings (default: 2)\n");
+    printf("              -1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm\n");
+    printf("\n");
+}
+
+// Returns the last component of model_path with a trailing ".gguf" extension removed.
+// std::filesystem does the splitting so the platform's path separators are honoured.
+static std::string model_name_from_path(const std::string & model_path) {
+    const std::filesystem::path file_name = std::filesystem::path(model_path).filename();
+    return file_name.extension() == ".gguf" ? file_name.stem().string() : file_name.string();
+}
+
+// Tokenizes the prompt, decodes it in a single batch and writes either the logits of
+// the last token or the embeddings to data/llamacpp-<model>[-embeddings].{bin,txt}.
+int main(int argc, char ** argv) {
+    std::string model_path;
+    std::string prompt = "Hello, my name is";
+    int ngl = 0;
+    bool embedding_mode = false;
+    bool pooling_enabled = false;
+    int32_t embd_norm = 2;  // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-embd-mode") == 0) {
+                embedding_mode = true;
+            } else if (strcmp(argv[i], "-pooling") == 0) {
+                pooling_enabled = true;
+            } else if (strcmp(argv[i], "-embd-norm") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        embd_norm = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
+    }
+
+    ggml_backend_load_all();
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    // Declared before the context so that the context is destroyed first.
+    model_ptr model(llama_model_load_from_file(model_path.c_str(), model_params), llama_model_free);
+
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const std::string model_name = model_name_from_path(model_path);
+    printf("Model name: %s\n", model_name.c_str());
+
+    const llama_vocab * vocab = llama_model_get_vocab(model.get());
+
+    // With a NULL buffer llama_tokenize reports the required token count as a negative
+    // number; anything else (including INT32_MIN, which cannot be negated) is a failure.
+    const int32_t n_required = llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+    if (n_required >= 0 || n_required == INT32_MIN) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+        return 1;
+    }
+    const int n_prompt = -n_required;
+
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+        return 1;
+    }
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_prompt;
+    ctx_params.n_batch = n_prompt;
+    ctx_params.no_perf = false;
+    if (embedding_mode) {
+        ctx_params.embeddings = true;
+        ctx_params.pooling_type = pooling_enabled ? LLAMA_POOLING_TYPE_MEAN : LLAMA_POOLING_TYPE_NONE;
+        ctx_params.n_ubatch = ctx_params.n_batch;
+    }
+
+    context_ptr ctx(llama_init_from_model(model.get(), ctx_params), llama_free);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    printf("Input prompt: \"%s\"\n", prompt.c_str());
+    printf("Tokenized prompt (%d tokens): ", n_prompt);
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s (%d)", s.c_str(), id);
+    }
+    printf("\n");
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    if (llama_decode(ctx.get(), batch)) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return 1;
+    }
+
+    float * data_ptr = NULL;
+    int data_size = 0;
+    const char * type = "";
+    std::vector<float> embd_out;
+
+    if (embedding_mode) {
+        // Pooling yields one vector for the sequence, otherwise there is one per token.
+        const bool pooled = llama_pooling_type(ctx.get()) != LLAMA_POOLING_TYPE_NONE;
+        const int n_embd = llama_model_n_embd(model.get());
+        const int n_embd_count = pooled ? 1 : batch.n_tokens;
+        const int n_embeddings = n_embd * n_embd_count;
+        type = "-embeddings";
+
+        float * embeddings = pooled ? llama_get_embeddings_seq(ctx.get(), 0) : llama_get_embeddings(ctx.get());
+        if (embeddings == NULL) {
+            fprintf(stderr, "%s: error: failed to get embeddings\n", __func__);
+            return 1;
+        }
+
+        if (pooled) {
+            embd_out.resize(n_embeddings);
+            printf("Normalizing embeddings using norm: %d\n", embd_norm);
+            common_embd_normalize(embeddings, embd_out.data(), n_embeddings, embd_norm);
+            embeddings = embd_out.data();
+        }
+
+        printf("Embedding dimension: %d\n", n_embd);
+        printf("\n");
+
+        // Print embeddings in the specified format
+        for (int j = 0; j < n_embd_count; j++) {
+            printf("embedding %d: ", j);
+
+            // Print first 3 values
+            for (int i = 0; i < 3 && i < n_embd; i++) {
+                printf("%9.6f ", embeddings[j * n_embd + i]);
+            }
+
+            printf(" ... ");
+
+            // Print last 3 values
+            for (int i = n_embd - 3; i < n_embd; i++) {
+                if (i >= 0) {
+                    printf("%9.6f ", embeddings[j * n_embd + i]);
+                }
+            }
+
+            printf("\n");
+        }
+        printf("\n");
+
+        printf("Embeddings size: %d\n", n_embeddings);
+
+        data_ptr = embeddings;
+        data_size = n_embeddings;
+    } else {
+        float * logits = llama_get_logits_ith(ctx.get(), batch.n_tokens - 1);
+        if (logits == NULL) {
+            fprintf(stderr, "%s: error: failed to get logits\n", __func__);
+            return 1;
+        }
+        const int n_logits = llama_vocab_n_tokens(vocab);
+        printf("Vocab size: %d\n", n_logits);
+
+        data_ptr = logits;
+        data_size = n_logits;
+    }
+
+    // Use the non-throwing overload: an exception escaping main() would terminate the process.
+    std::error_code ec;
+    std::filesystem::create_directory("data", ec);
+    if (ec) {
+        fprintf(stderr, "%s: error: failed to create the data directory: %s\n", __func__, ec.message().c_str());
+        return 1;
+    }
+
+    // Save data to binary file
+    const std::string bin_filename = "data/llamacpp-" + model_name + type + ".bin";
+    printf("Saving data to %s\n", bin_filename.c_str());
+
+    file_ptr f(fopen(bin_filename.c_str(), "wb"), fclose);
+    if (!f) {
+        fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
+        return 1;
+    }
+    if (fwrite(data_ptr, sizeof(float), data_size, f.get()) != static_cast<size_t>(data_size)) {
+        fprintf(stderr, "%s: error: failed to write binary output file\n", __func__);
+        return 1;
+    }
+
+    // Also save as text for debugging
+    const std::string txt_filename = "data/llamacpp-" + model_name + type + ".txt";
+    f.reset(fopen(txt_filename.c_str(), "w"));
+    if (!f) {
+        fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
+        return 1;
+    }
+    for (int i = 0; i < data_size; i++) {
+        fprintf(f.get(), "%d: %.6f\n", i, data_ptr[i]);
+    }
+    f.reset();
+
+    if (!embedding_mode) {
+        printf("First 10 logits: ");
+        for (int i = 0; i < 10 && i < data_size; i++) {
+            printf("%.6f ", data_ptr[i]);
+        }
+        printf("\n");
+
+        printf("Last 10 logits: ");
+        for (int i = data_size - 10; i < data_size; i++) {
+            if (i >= 0) printf("%.6f ", data_ptr[i]);
+        }
+        printf("\n\n");
+    }
+
+    printf("Data saved to %s\n", bin_filename.c_str());
+    printf("Data saved to %s\n", txt_filename.c_str());
+
+    // ctx and model are released by their owners, context first.
+    return 0;
+}
--- a/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
@@ -5,11 +5,8 @@ set -e
 MODEL_PATH="${1:-"$MODEL_PATH"}"
 MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"

-CONVERTED_MODEL_PATH="${1:-"$CONVERTED_MODEL"}"
-CONVERTED_MODEL_NAME="${2:-$(basename "$CONVERTED_MODEL_PATH" ".gguf")}"
-
 if [ -t 0 ]; then
-    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
+    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
 else
    # Process piped JSON data and convert to binary (matching logits.cpp format)
    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@@ -3,11 +3,10 @@
 import sys
 import numpy as np
 from pathlib import Path
-import os

 # Add utils directory to path for direct script execution
 sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
-from common import get_model_name_from_env_path, compare_tokens, exit_with_warning  # type: ignore[import-not-found]
+from common import get_model_name_from_env_path  # type: ignore[import-not-found]

 def quick_logits_check(pytorch_file, llamacpp_file):
    """Lightweight sanity check before NMSE"""
@@ -39,7 +38,6 @@ def quick_logits_check(pytorch_file, llamacpp_file):
    return True

 def main():
-    model_path = os.environ.get('MODEL_PATH')
    model_name = get_model_name_from_env_path('MODEL_PATH')
    data_dir = Path("data")
    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
@@ -60,12 +58,6 @@ def main():

    print("Checked all required files were found. Proceeding...\n")

-    # Verify tokens as they are a prerequisite for logits comparison.
-    print("🔍 Token Comparison Check")
-    print("=" * 40)
-    if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"):
-        exit_with_warning("\n❌ Token mismatch detected", model_path)
-    print()

    print("🔍 GGML Model Validation for model ", model_name)
    print("=" * 40)
@@ -81,7 +73,8 @@ def main():
        print("       Ok to proceed with NMSE check...")
        sys.exit(0)
    else:
-        exit_with_warning(f"❌ NOK: Top 10 predictions don't match - generation will differ", model_path)
+        print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
+        sys.exit(1)

 if __name__ == "__main__":
    main()
--- a/examples/model-conversion/scripts/causal/modelcard.template
+++ b/examples/model-conversion/scripts/causal/modelcard.template
@@ -7,7 +7,7 @@ base_model:
 Recommended way to run this model:

 ```sh
-llama-server -hf {namespace}/{model_name}-GGUF
+llama-server -hf {namespace}/{model_name}-GGUF -c 0
 ```

 Then, access http://localhost:8080
--- a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
+++ b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
@@ -67,7 +67,7 @@ with torch.no_grad():
    last_hidden_states = outputs.hidden_states[-1]

    # Get embeddings for all tokens
-    token_embeddings = last_hidden_states[0].float().cpu().numpy()  # Remove batch dimension
+    token_embeddings = last_hidden_states[0].float().cpu().numpy()  # Remove batch dimension; upcast first because NumPy has no bfloat16

    print(f"Hidden states shape: {last_hidden_states.shape}")
    print(f"Token embeddings shape: {token_embeddings.shape}")
--- a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
@@ -13,6 +13,6 @@ if [ -z "$CONVERTED_MODEL" ]; then
    exit 1
 fi

-cmake --build ../../build --target llama-debug -j8
+cmake --build ../../build --target llama-logits -j8

-../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
+../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today"
--- a/examples/model-conversion/scripts/causal/run-converted-model.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model.sh
@@ -21,6 +21,6 @@ fi
 echo $CONVERTED_MODEL
 echo $MODEL_TESTING_PROMPT

-cmake --build ../../build --target llama-debug -j8
+cmake --build ../../build --target llama-logits -j8

-../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"
--- a/examples/model-conversion/scripts/causal/run-org-model.py
+++ b/examples/model-conversion/scripts/causal/run-org-model.py
@@ -4,165 +4,149 @@ import argparse
 import os
 import sys
 import importlib
-import torch
-import numpy as np
-
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
+from pathlib import Path

 # Add parent directory to path for imports
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-from utils.common import debug_hook, save_output_data

-def parse_arguments():
-    parser = argparse.ArgumentParser(description="Process model with specified path")
-    parser.add_argument("--model-path", "-m", help="Path to the model")
-    parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
-    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
-    parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto")
-    return parser.parse_args()
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
+import torch
+import numpy as np
+from utils.common import debug_hook

-def load_model_and_tokenizer(model_path, device="auto"):
-    print("Loading model and tokenizer using AutoTokenizer:", model_path)
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-    multimodal = False
-    full_config = config
+parser = argparse.ArgumentParser(description="Process model with specified path")
+parser.add_argument("--model-path", "-m", help="Path to the model")
+parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
+parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
+args = parser.parse_args()

-    # Determine device_map based on device argument
-    if device == "cpu":
-        device_map = {"": "cpu"}
-        print("Forcing CPU usage")
-    elif device == "auto":
-        device_map = "auto"
-    else:
-        device_map = {"": device}
+model_path = os.environ.get("MODEL_PATH", args.model_path)
+if model_path is None:
+    parser.error(
+        "Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
+    )

-    print("Model type:       ", config.model_type)
-    if "vocab_size" not in config and "text_config" in config:
-        config = config.text_config
-        multimodal = True
+### If you want to dump RoPE activations, uncomment the following lines:
+### === START ROPE DEBUG ===
+# from utils.common import setup_rope_debug
+# setup_rope_debug("transformers.models.apertus.modeling_apertus")
+### == END ROPE DEBUG ===

-    print("Vocab size:       ", config.vocab_size)
-    print("Hidden size:      ", config.hidden_size)
-    print("Number of layers: ", config.num_hidden_layers)
-    print("BOS token id:     ", config.bos_token_id)
-    print("EOS token id:     ", config.eos_token_id)

-    unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
-    if unreleased_model_name:
-        model_name_lower = unreleased_model_name.lower()
-        unreleased_module_path = (
-            f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+print("Loading model and tokenizer using AutoTokenizer:", model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+multimodal = False
+full_config = config
+
+print("Model type:       ", config.model_type)
+if "vocab_size" not in config and "text_config" in config:
+    config = config.text_config
+    multimodal = True
+print("Vocab size:       ", config.vocab_size)
+print("Hidden size:      ", config.hidden_size)
+print("Number of layers: ", config.num_hidden_layers)
+print("BOS token id:     ", config.bos_token_id)
+print("EOS token id:     ", config.eos_token_id)
+
+unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
+if unreleased_model_name:
+    model_name_lower = unreleased_model_name.lower()
+    unreleased_module_path = (
+        f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+    )
+    class_name = f"{unreleased_model_name}ForCausalLM"
+    print(f"Importing unreleased model module: {unreleased_module_path}")
+
+    try:
+        model_class = getattr(
+            importlib.import_module(unreleased_module_path), class_name
+        )
+        model = model_class.from_pretrained(
+            model_path
+        )  # Note: from_pretrained, not fromPretrained
+    except (ImportError, AttributeError) as e:
+        print(f"Failed to import or load model: {e}")
+        exit(1)
+else:
+    if multimodal:
+        model = AutoModelForImageTextToText.from_pretrained(
+            model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=full_config
        )
-        class_name = f"{unreleased_model_name}ForCausalLM"
-        print(f"Importing unreleased model module: {unreleased_module_path}")
-
-        try:
-            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-            model = model_class.from_pretrained(
-                    model_path,
-                    device_map=device_map,
-                    offload_folder="offload",
-                    trust_remote_code=True,
-                    config=config
-            )
-        except (ImportError, AttributeError) as e:
-            print(f"Failed to import or load model: {e}")
-            exit(1)
    else:
-        if multimodal:
-            model = AutoModelForImageTextToText.from_pretrained(
-                    model_path,
-                    device_map=device_map,
-                    offload_folder="offload",
-                    trust_remote_code=True,
-                    config=full_config
-            )
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    device_map=device_map,
-                    offload_folder="offload",
-                    trust_remote_code=True,
-                    config=config
-            )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
+        )

-    print(f"Model class: {model.__class__.__name__}")
+if args.verbose:
+    for name, module in model.named_modules():
+        if len(list(module.children())) == 0:  # only leaf modules
+            module.register_forward_hook(debug_hook(name))

-    return model, tokenizer, config
+model_name = os.path.basename(model_path)
+# Printing the Model class to allow for easier debugging. This can be useful
+# when working with models that have not been publicly released yet and this
+# might require that the concrete class is imported and used directly instead
+# of using AutoModelForCausalLM.
+print(f"Model class: {model.__class__.__name__}")

-def enable_torch_debugging(model):
-        for name, module in model.named_modules():
-            if len(list(module.children())) == 0:  # only leaf modules
-                module.register_forward_hook(debug_hook(name))
+device = next(model.parameters()).device
+if args.prompt_file:
+    with open(args.prompt_file, encoding='utf-8') as f:
+        prompt = f.read()
+elif os.getenv("MODEL_TESTING_PROMPT"):
+    prompt = os.getenv("MODEL_TESTING_PROMPT")
+else:
+    prompt = "Hello, my name is"
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

-def get_prompt(args):
-    if args.prompt_file:
-        with open(args.prompt_file, encoding='utf-8') as f:
-            return f.read()
-    elif os.getenv("MODEL_TESTING_PROMPT"):
-        return os.getenv("MODEL_TESTING_PROMPT")
-    else:
-        return "Hello, my name is"
+print(f"Input tokens: {input_ids}")
+print(f"Input text: {repr(prompt)}")
+print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")

-def main():
-    args = parse_arguments()
-    model_path = os.environ.get("MODEL_PATH", args.model_path)
-    if model_path is None:
-        print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
-        sys.exit(1)
+batch_size = 512

+with torch.no_grad():
+    past = None
+    outputs = None
+    for i in range(0, input_ids.size(1), batch_size):
+        print(f"Processing chunk with tokens {i} to {i + batch_size}")
+        chunk = input_ids[:, i:i + batch_size]
+        outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
+        past = outputs.past_key_values

-    model, tokenizer, config = load_model_and_tokenizer(model_path, args.device)
+    logits = outputs.logits # type: ignore

-    if args.verbose:
-        enable_torch_debugging(model)
+    # Extract logits for the last token (next token prediction)
+    last_logits = logits[0, -1, :].float().cpu().numpy()

-    model_name = os.path.basename(model_path)
+    print(f"Logits shape: {logits.shape}")
+    print(f"Last token logits shape: {last_logits.shape}")
+    print(f"Vocab size: {len(last_logits)}")

-    # Iterate over the model parameters (the tensors) and get the first one
-    # and use it to get the device the model is on.
-    device = next(model.parameters()).device
-    prompt = get_prompt(args)
-    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
-    token_ids = input_ids[0].cpu().tolist()
+    data_dir = Path("data")
+    data_dir.mkdir(exist_ok=True)
+    bin_filename = data_dir / f"pytorch-{model_name}.bin"
+    txt_filename = data_dir / f"pytorch-{model_name}.txt"

-    print(f"Input tokens: {input_ids}")
-    print(f"Input text: {repr(prompt)}")
-    print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
+    # Save to file for comparison
+    last_logits.astype(np.float32).tofile(bin_filename)

-    batch_size = 512
+    # Also save as text file for easy inspection
+    with open(txt_filename, "w") as f:
+        for i, logit in enumerate(last_logits):
+            f.write(f"{i}: {logit:.6f}\n")

-    with torch.no_grad():
-        past = None
-        outputs = None
-        for i in range(0, input_ids.size(1), batch_size):
-            print(f"Processing chunk with tokens {i} to {i + batch_size}")
-            chunk = input_ids[:, i:i + batch_size]
-            outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
-            past = outputs.past_key_values
+    # Print some sample logits for quick verification
+    print(f"First 10 logits: {last_logits[:10]}")
+    print(f"Last 10 logits: {last_logits[-10:]}")

-        logits = outputs.logits # type: ignore
+    # Show top 5 predicted tokens
+    top_indices = np.argsort(last_logits)[-5:][::-1]
+    print("Top 5 predictions:")
+    for idx in top_indices:
+        token = tokenizer.decode([idx])
+        print(f"  Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")

-        # Extract logits for the last token (next token prediction)
-        last_logits = logits[0, -1, :].float().cpu().numpy()
-
-        print(f"Logits shape: {logits.shape}")
-        print(f"Last token logits shape: {last_logits.shape}")
-        print(f"Vocab size: {len(last_logits)}")
-
-        # Print some sample logits for quick verification
-        print(f"First 10 logits: {last_logits[:10]}")
-        print(f"Last 10 logits: {last_logits[-10:]}")
-
-        # Show top 5 predicted tokens
-        top_indices = np.argsort(last_logits)[-5:][::-1]
-        print("Top 5 predictions:")
-        for idx in top_indices:
-            token = tokenizer.decode([idx])
-            print(f"  Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
-
-        save_output_data(last_logits, token_ids, prompt, model_name)
-
-if __name__ == "__main__":
-    main()
+    print(f"Saved bin logits to: {bin_filename}")
+    print(f"Saved txt logits to: {txt_filename}")
--- a/examples/model-conversion/scripts/embedding/run-converted-model.sh
+++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh
@@ -5,7 +5,7 @@ set -e
 # Parse command line arguments
 CONVERTED_MODEL=""
 PROMPTS_FILE=""
-EMBD_NORMALIZE="2"
+USE_POOLING=""

 while [[ $# -gt 0 ]]; do
    case $1 in
@@ -13,9 +13,9 @@ while [[ $# -gt 0 ]]; do
            PROMPTS_FILE="$2"
            shift 2
            ;;
-        --embd-normalize)
-            EMBD_NORMALIZE="$2"
-            shift 2
+        --pooling)
+            USE_POOLING="1"
+            shift
            ;;
        *)
            if [ -z "$CONVERTED_MODEL" ]; then
@@ -50,5 +50,10 @@ fi

 echo $CONVERTED_MODEL

-cmake --build ../../build --target llama-debug -j8
-../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
+cmake --build ../../build --target llama-logits -j8
+# TODO: update logits.cpp to accept a --file/-f option for the prompt
+if [ -n "$USE_POOLING" ]; then
+    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
+else
+    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
+fi
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -2,242 +2,176 @@

 import argparse
 import os
-import sys
+import numpy as np
 import importlib
+from pathlib import Path

 from transformers import AutoTokenizer, AutoConfig, AutoModel
 import torch

-# Add parent directory to path for imports
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-from utils.common import save_output_data
+unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

+parser = argparse.ArgumentParser(description='Process model with specified path')
+parser.add_argument('--model-path', '-m', help='Path to the model')
+parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
+parser.add_argument('--use-sentence-transformers', action='store_true',
+                    help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
+args = parser.parse_args()

-def parse_arguments():
-    parser = argparse.ArgumentParser(description='Run original embedding model')
-    parser.add_argument(
-        '--model-path',
-        '-m',
-        help='Path to the model'
-    )
-    parser.add_argument(
-        '--prompts-file',
-        '-p',
-        help='Path to file containing prompts (one per line)'
-    )
-    parser.add_argument(
-        '--use-sentence-transformers',
-        action='store_true',
-        help=('Use SentenceTransformer to apply all numbered layers '
-              '(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
-    )
-    parser.add_argument(
-        '--device',
-        '-d',
-        help='Device to use (cpu, cuda, mps, auto)',
-        default='auto'
-    )
-    return parser.parse_args()
+def read_prompt_from_file(file_path):
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read().strip()
+    except FileNotFoundError:
+        print(f"Error: Prompts file '{file_path}' not found")
+        exit(1)
+    except Exception as e:
+        print(f"Error reading prompts file: {e}")
+        exit(1)

+model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
+if model_path is None:
+    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")

-def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"):
-    if device == "cpu":
-        device_map = {"": "cpu"}
-        print("Forcing CPU usage")
-    elif device == "auto":
-        # On Mac, "auto" device_map can cause issues with accelerate
-        # So we detect the best device manually
-        if torch.cuda.is_available():
-            device_map = {"": "cuda"}
-            print("Using CUDA")
-        elif torch.backends.mps.is_available():
-            device_map = {"": "mps"}
-            print("Using MPS (Apple Metal)")
-        else:
-            device_map = {"": "cpu"}
-            print("Using CPU")
-    else:
-        device_map = {"": device}
+# Determine if we should use SentenceTransformer
+use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')

-    if use_sentence_transformers:
-        from sentence_transformers import SentenceTransformer
-        print("Using SentenceTransformer to apply all numbered layers")
-        model = SentenceTransformer(model_path)
-        tokenizer = model.tokenizer
-        config = model[0].auto_model.config  # type: ignore
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+if use_sentence_transformers:
+    from sentence_transformers import SentenceTransformer
+    print("Using SentenceTransformer to apply all numbered layers")
+    model = SentenceTransformer(model_path)
+    tokenizer = model.tokenizer
+    config = model[0].auto_model.config  # type: ignore
+else:
+    tokenizer = AutoTokenizer.from_pretrained(model_path)

-        # This can be used to override the sliding window size for manual testing. This
-        # can be useful to verify the sliding window attention mask in the original model
-        # and compare it with the converted .gguf model.
-        if hasattr(config, 'sliding_window'):
-            original_sliding_window = config.sliding_window
-            print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
+    config = AutoConfig.from_pretrained(model_path)

-        unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
-        print(f"Using unreleased model: {unreleased_model_name}")
-        if unreleased_model_name:
-            model_name_lower = unreleased_model_name.lower()
-            unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
-            class_name = f"{unreleased_model_name}Model"
-            print(f"Importing unreleased model module: {unreleased_module_path}")
+    # This can be used to override the sliding window size for manual testing. This
+    # can be useful to verify the sliding window attention mask in the original model
+    # and compare it with the converted .gguf model.
+    if hasattr(config, 'sliding_window'):
+        original_sliding_window = config.sliding_window
+        #original_sliding_window = 6
+        print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")

-            try:
-                model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-                model = model_class.from_pretrained(
-                    model_path,
-                    device_map=device_map,
-                    offload_folder="offload",
-                    trust_remote_code=True,
-                    config=config
-                )
-            except (ImportError, AttributeError) as e:
-                print(f"Failed to import or load model: {e}")
-                sys.exit(1)
-        else:
-            model = AutoModel.from_pretrained(
-                model_path,
-                device_map=device_map,
-                offload_folder="offload",
-                trust_remote_code=True,
-                config=config
-            )
-        print(f"Model class: {type(model)}")
-        print(f"Model file: {type(model).__module__}")
+    print(f"Using unreleased model: {unreleased_model_name}")
+    if unreleased_model_name:
+        model_name_lower = unreleased_model_name.lower()
+        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+        class_name = f"{unreleased_model_name}Model"
+        print(f"Importing unreleased model module: {unreleased_module_path}")

-        # Verify the model is using the correct sliding window
-        if hasattr(model.config, 'sliding_window'):  # type: ignore
-            print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
-        else:
-            print("Model config does not have sliding_window attribute")
-
-    return model, tokenizer, config
-
-
-def get_prompt(args):
-    if args.prompts_file:
        try:
-            with open(args.prompts_file, 'r', encoding='utf-8') as f:
-                return f.read().strip()
-        except FileNotFoundError:
-            print(f"Error: Prompts file '{args.prompts_file}' not found")
-            sys.exit(1)
-        except Exception as e:
-            print(f"Error reading prompts file: {e}")
-            sys.exit(1)
+            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
+            model = model_class.from_pretrained(model_path, config=config)
+        except (ImportError, AttributeError) as e:
+            print(f"Failed to import or load model: {e}")
+            exit(1)
    else:
-        return "Hello world today"
+        model = AutoModel.from_pretrained(model_path, config=config)
+    print(f"Model class: {type(model)}")
+    print(f"Model file: {type(model).__module__}")

-
-def main():
-    args = parse_arguments()
-
-    model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
-    if model_path is None:
-        print("Error: Model path must be specified either via --model-path argument "
-              "or EMBEDDING_MODEL_PATH environment variable")
-        sys.exit(1)
-
-    # Determine if we should use SentenceTransformer
-    use_st = (
-        args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
-    )
-
-    model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device)
-
-    # Get the device the model is on
-    if not use_st:
-        device = next(model.parameters()).device
+# Verify the model is using the correct sliding window
+if not use_sentence_transformers:
+    if hasattr(model.config, 'sliding_window'):  # type: ignore
+        print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
    else:
-        # For SentenceTransformer, get device from the underlying model
-        device = next(model[0].auto_model.parameters()).device  # type: ignore
+        print("Model config does not have sliding_window attribute")

-    model_name = os.path.basename(model_path)
+model_name = os.path.basename(model_path)

-    prompt_text = get_prompt(args)
+if args.prompts_file:
+    prompt_text = read_prompt_from_file(args.prompts_file)
    texts = [prompt_text]
+else:
+    texts = ["Hello world today"]

-    with torch.no_grad():
-        if use_st:
-            embeddings = model.encode(texts, convert_to_numpy=True)
-            all_embeddings = embeddings  # Shape: [batch_size, hidden_size]
+with torch.no_grad():
+    if use_sentence_transformers:
+        embeddings = model.encode(texts, convert_to_numpy=True)
+        all_embeddings = embeddings  # Shape: [batch_size, hidden_size]

-            encoded = tokenizer(
-                texts,
-                padding=True,
-                truncation=True,
-                return_tensors="pt"
-            )
-            tokens = encoded['input_ids'][0]
-            token_ids = tokens.cpu().tolist()
-            token_strings = tokenizer.convert_ids_to_tokens(tokens)
-            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
-                print(f"{token_id:6d} -> '{token_str}'")
+        encoded = tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+        tokens = encoded['input_ids'][0]
+        token_strings = tokenizer.convert_ids_to_tokens(tokens)
+        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+            print(f"{token_id:6d} -> '{token_str}'")

-            print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
-            print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
-        else:
-            # Standard approach: use base model output only
-            encoded = tokenizer(
-                texts,
-                padding=True,
-                truncation=True,
-                return_tensors="pt"
-            )
+        print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
+        print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
+    else:
+        # Standard approach: use base model output only
+        encoded = tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )

-            tokens = encoded['input_ids'][0]
-            token_ids = tokens.cpu().tolist()
-            token_strings = tokenizer.convert_ids_to_tokens(tokens)
-            for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
-                print(f"{token_id:6d} -> '{token_str}'")
+        tokens = encoded['input_ids'][0]
+        token_strings = tokenizer.convert_ids_to_tokens(tokens)
+        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+            print(f"{token_id:6d} -> '{token_str}'")

-            # Move inputs to the same device as the model
-            encoded = {k: v.to(device) for k, v in encoded.items()}
-            outputs = model(**encoded)
-            hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
+        outputs = model(**encoded)
+        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]

-            all_embeddings = hidden_states[0].float().cpu().numpy()  # Shape: [seq_len, hidden_size]
+        all_embeddings = hidden_states[0].cpu().numpy()  # Shape: [seq_len, hidden_size]

-            print(f"Hidden states shape: {hidden_states.shape}")
-            print(f"All embeddings shape: {all_embeddings.shape}")
-            print(f"Embedding dimension: {all_embeddings.shape[1]}")
+        print(f"Hidden states shape: {hidden_states.shape}")
+        print(f"All embeddings shape: {all_embeddings.shape}")
+        print(f"Embedding dimension: {all_embeddings.shape[1]}")

-        if len(all_embeddings.shape) == 1:
-            n_embd = all_embeddings.shape[0]  # type: ignore
-            n_embd_count = 1
-            all_embeddings = all_embeddings.reshape(1, -1)
-        else:
-            n_embd = all_embeddings.shape[1]  # type: ignore
-            n_embd_count = all_embeddings.shape[0]  # type: ignore
+    if len(all_embeddings.shape) == 1:
+        n_embd = all_embeddings.shape[0]  # type: ignore
+        n_embd_count = 1
+        all_embeddings = all_embeddings.reshape(1, -1)
+    else:
+        n_embd = all_embeddings.shape[1]  # type: ignore
+        n_embd_count = all_embeddings.shape[0]  # type: ignore

-        print()
+    print()

+    for j in range(n_embd_count):
+        embedding = all_embeddings[j]
+        print(f"embedding {j}: ", end="")
+
+        # Print first 3 values
+        for i in range(min(3, n_embd)):
+            print(f"{embedding[i]:9.6f} ", end="")
+
+        print(" ... ", end="")
+
+        # Print last 3 values
+        for i in range(n_embd - 3, n_embd):
+            print(f"{embedding[i]:9.6f} ", end="")
+
+        print()  # New line
+
+    print()
+
+    data_dir = Path("data")
+    data_dir.mkdir(exist_ok=True)
+    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
+    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
+
+    flattened_embeddings = all_embeddings.flatten()
+    flattened_embeddings.astype(np.float32).tofile(bin_filename)
+
+    with open(txt_filename, "w") as f:
+        idx = 0
        for j in range(n_embd_count):
-            embedding = all_embeddings[j]
-            print(f"embedding {j}: ", end="")
-
-            # Print first 3 values
-            for i in range(min(3, n_embd)):
-                print(f"{embedding[i]:9.6f} ", end="")
-
-            print(" ... ", end="")
-
-            # Print last 3 values
-            for i in range(n_embd - 3, n_embd):
-                print(f"{embedding[i]:9.6f} ", end="")
-
-            print()  # New line
-
-        print()
-
-        flattened_embeddings = all_embeddings.flatten()
-        print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
-        print("")
-
-        save_output_data(flattened_embeddings, token_ids, prompt_text, model_name, type_suffix="-embeddings")
-
-
-if __name__ == "__main__":
-    main()
+            for value in all_embeddings[j]:
+                f.write(f"{idx}: {value:.6f}\n")
+                idx += 1
+    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
+    print("")
+    print(f"Saved bin embeddings to: {bin_filename}")
+    print(f"Saved txt embeddings to: {txt_filename}")
--- a/examples/model-conversion/scripts/utils/common.py
+++ b/examples/model-conversion/scripts/utils/common.py
@@ -3,11 +3,6 @@
 import os
 import sys
 import torch
-import transformers
-import json
-import textwrap
-import numpy as np
-from pathlib import Path


 def get_model_name_from_env_path(env_path_name):
@@ -153,147 +148,3 @@ def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_
    # Patch it
    setattr(module, function_name, debug_rope)
    print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
-
-
-def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"):
-    """
-    Save output data (logits/embeddings), tokens, and prompt to files.
-
-    Args:
-        data:        numpy array of floats (logits or embeddings)
-        tokens:      list or array of token IDs
-        prompt:      string containing the input prompt
-        model_name:  name of the model
-        type_suffix: optional suffix like "-embeddings" (default: "")
-        output_dir:  directory to save files (default: "data")
-
-    Creates the following files in output_dir:
-        - pytorch-{model_name}{type_suffix}.bin
-        - pytorch-{model_name}{type_suffix}.txt
-        - pytorch-{model_name}{type_suffix}-prompt.txt
-        - pytorch-{model_name}{type_suffix}-tokens.bin
-    """
-    data_dir = Path(output_dir)
-    data_dir.mkdir(exist_ok=True)
-    base_path = data_dir / f"pytorch-{model_name}{type_suffix}"
-
-    # Convert and flatten logits/embeddings
-    data = data.cpu().numpy() if isinstance(data, torch.Tensor) else np.asarray(data)
-    data = data.flatten() if data.ndim > 1 else data
-
-    # Save logits/embedding files
-    data.astype(np.float32).tofile(f"{base_path}.bin")
-    print(f"Data saved to {base_path}.bin")
-
-    with open(f"{base_path}.txt", "w") as f:
-        f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(data))
-    print(f"Data saved to {base_path}.txt")
-
-    # Convert and flatten tokens
-    tokens = tokens.cpu().numpy() if isinstance(tokens, torch.Tensor) else np.asarray(tokens)
-    tokens = tokens.flatten() if tokens.ndim > 1 else tokens
-
-    # Save token binary file
-    tokens.astype(np.int32).tofile(f"{base_path}-tokens.bin")
-    print(f"Tokens saved to {base_path}-tokens.bin")
-
-    # Save prompt file
-    with open(f"{base_path}-prompt.txt", "w") as f:
-        f.write(f"prompt: {prompt}\n")
-        f.write(f"n_tokens: {len(tokens)}\n")
-        f.write(f"token ids: {', '.join(str(int(tid)) for tid in tokens)}\n")
-    print(f"Prompt saved to {base_path}-prompt.txt")
-
-
-def compare_tokens(original, converted, type_suffix="", output_dir="data"):
-    data_dir = Path(output_dir)
-
-    # Read tokens from both models
-    tokens1_file = data_dir / f"{original}{type_suffix}-tokens.bin"
-    tokens2_file = data_dir / f"{converted}{type_suffix}-tokens.bin"
-
-    if not tokens1_file.exists():
-        print(f"Error: Token file not found: {tokens1_file}")
-        return False
-
-    if not tokens2_file.exists():
-        print(f"Error: Token file not found: {tokens2_file}")
-        return False
-
-    tokens1 = np.fromfile(tokens1_file, dtype=np.int32)
-    tokens2 = np.fromfile(tokens2_file, dtype=np.int32)
-
-    print(f"\nComparing tokens between:")
-    print(f"  Original : {original} ({len(tokens1)} tokens)")
-    print(f"  Converted: {converted} ({len(tokens2)} tokens)")
-
-    if len(tokens1) != len(tokens2):
-        print(f"\n❌ Token count mismatch: {len(tokens1)} vs {len(tokens2)}")
-        return False
-
-    if np.array_equal(tokens1, tokens2):
-        print(f"\n✅ All {len(tokens1)} tokens match!")
-        return True
-
-    mismatches = np.where(tokens1 != tokens2)[0]
-    print(f"\n❌ Found {len(mismatches)} mismatched tokens:")
-
-    num_to_show = min(len(mismatches), 10)
-    for idx in mismatches[:num_to_show]:
-        print(f"  Position {idx}: {tokens1[idx]} vs {tokens2[idx]}")
-
-    if len(mismatches) > num_to_show:
-        print(f"  ... and {len(mismatches) - num_to_show} more mismatches")
-
-    return False
-
-
-def show_version_warning(current_version, model_version):
-    if not model_version:
-        return False
-
-    try:
-        from packaging.version import parse, InvalidVersion
-        try:
-            return parse(current_version) < parse(model_version)
-        except InvalidVersion:
-            return current_version != model_version
-    except ImportError:
-        return current_version != model_version
-
-def get_model_transformers_version(model_path):
-    if not model_path:
-        return None
-
-    config_path = Path(model_path) / "config.json"
-    if not config_path.is_file():
-        return None
-
-    try:
-        with open(config_path, "r", encoding="utf-8") as f:
-            config = json.load(f)
-        return config.get("transformers_version")
-    except (IOError, json.JSONDecodeError) as e:
-        print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr)
-        return None
-
-def exit_with_warning(message, model_path):
-    print(message)
-
-    if model_path and transformers is not None:
-        model_transformers_version = get_model_transformers_version(model_path)
-        transformers_version       = transformers.__version__
-        if show_version_warning(transformers_version, model_transformers_version):
-            warning_message = f"""
-                =====================================================================
-                Verification failure might be due to a transformers version mismatch:
-
-                Current transformers version: {transformers_version}
-                Model's required version    : {model_transformers_version}
-
-                Consider installing the version specified by the model's config:
-                pip install transformers=={model_transformers_version}
-                =====================================================================
-            """
-            print(textwrap.dedent(warning_message))
-    sys.exit(1)
--- a/examples/model-conversion/scripts/utils/compare_tokens.py
+++ b/examples/model-conversion/scripts/utils/compare_tokens.py
@@ -1,76 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import sys
-from common import compare_tokens  # type: ignore
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description='Compare tokens between two models',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  %(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
-        """
-    )
-    parser.add_argument(
-        'original',
-        help='Original model name'
-    )
-    parser.add_argument(
-        'converted',
-        help='Converted model name'
-    )
-    parser.add_argument(
-        '-s', '--suffix',
-        default='',
-        help='Type suffix (e.g., "-embeddings")'
-    )
-    parser.add_argument(
-        '-d', '--data-dir',
-        default='data',
-        help='Directory containing token files (default: data)'
-    )
-    parser.add_argument(
-        '-v', '--verbose',
-        action='store_true',
-        help='Print prompts from both models'
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = parse_arguments()
-
-    if args.verbose:
-        from pathlib import Path
-        data_dir = Path(args.data_dir)
-
-        prompt1_file = data_dir / f"{args.original}{args.suffix}-prompt.txt"
-        prompt2_file = data_dir / f"{args.converted}{args.suffix}-prompt.txt"
-
-        if prompt1_file.exists():
-            print(f"\nOriginal model prompt ({args.original}):")
-            print(f"  {prompt1_file.read_text().strip()}")
-
-        if prompt2_file.exists():
-            print(f"\nConverted model prompt ({args.converted}):")
-            print(f"  {prompt2_file.read_text().strip()}")
-
-        print()
-
-    result = compare_tokens(
-        args.original,
-        args.converted,
-        type_suffix=args.suffix,
-        output_dir=args.data_dir
-    )
-
-    # Enable the script to be used in shell scripts so that they can check
-    # the exit code for success/failure.
-    sys.exit(0 if result else 1)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@@ -4,10 +4,8 @@ import numpy as np
 import argparse
 import os
 import importlib
-from pathlib import Path

 from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
-from common import compare_tokens, exit_with_warning  # type: ignore[import-not-found]

 unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

@@ -159,31 +157,16 @@ def main():
    else:
        prompt = args.prompt

-    python_emb_path = Path(args.python_embeddings)
-    cpp_emb_path = Path(args.cpp_embeddings)
-
-    # Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
-    python_model_name = python_emb_path.stem.replace("-embeddings", "")
-    cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")
-
    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
    print("=" * 70)

-    # First verify tokens match before comparing embeddings
-    print("\n🔍 Token Comparison Check")
-    print("=" * 70)
-    data_dir = python_emb_path.parent
-    if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
-        exit_with_warning("\n❌ Token mismatch detected", args.model_path)
-    print()
-
    # Single prompt detailed comparison
    print(f"\nTesting with prompt: '{prompt}'")

    # Load the python model to get configuration information and also to load the tokenizer.
    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
-    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(args.model_path)

    if unreleased_model_name:
        model_name_lower = unreleased_model_name.lower()
@@ -203,9 +186,9 @@ def main():
            exit(1)
    else:
        if args.causal:
-            model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(args.model_path)
        else:
-            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
+            model = AutoModel.from_pretrained(args.model_path)

    encoded = tokenizer(prompt, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
@@ -236,7 +219,7 @@ def main():
    elif avg_cross_sim > 0.70:
        print("⚠️  FAIR: Models have some differences")
    else:
-        exit_with_warning("❌ POOR: Models are significantly different", args.model_path)
+        print("❌ POOR: Models are significantly different")

 if __name__ == "__main__":
    main()
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -217,13 +217,13 @@ int main(int argc, char ** argv) {
    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);

    // allocate output
-    const int n_embd_out = llama_model_n_embd_out(model);
-    std::vector<float> embeddings(n_chunks * n_embd_out, 0);
+    const int n_embd = llama_model_n_embd(model);
+    std::vector<float> embeddings(n_chunks * n_embd, 0);
    float * emb = embeddings.data();

    // break into batches
-    unsigned int p = 0; // number of prompts processed already
-    unsigned int s = 0; // number of prompts in current batch
+    int p = 0; // number of prompts processed already
+    int s = 0; // number of prompts in current batch
    for (int k = 0; k < n_chunks; k++) {
        // clamp to n_batch tokens
        auto & inp = chunks[k].tokens;
@@ -231,9 +231,9 @@ int main(int argc, char ** argv) {
        const uint64_t n_toks = inp.size();

        // encode if at capacity
-        if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
-            float * out = emb + p * n_embd_out;
-            batch_process(ctx, batch, out, s, n_embd_out);
+        if (batch.n_tokens + n_toks > n_batch) {
+            float * out = emb + p * n_embd;
+            batch_process(ctx, batch, out, s, n_embd);
            common_batch_clear(batch);
            p += s;
            s = 0;
@@ -245,12 +245,12 @@ int main(int argc, char ** argv) {
    }

    // final batch
-    float * out = emb + p * n_embd_out;
-    batch_process(ctx, batch, out, s, n_embd_out);
+    float * out = emb + p * n_embd;
+    batch_process(ctx, batch, out, s, n_embd);

    // save embeddings to chunks
    for (int i = 0; i < n_chunks; i++) {
-        chunks[i].embedding = std::vector<float>(emb + i * n_embd_out, emb + (i + 1) * n_embd_out);
+        chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
        // clear tokens as they are no longer needed
        chunks[i].tokens.clear();
    }
@@ -266,8 +266,8 @@ int main(int argc, char ** argv) {

        batch_add_seq(query_batch, query_tokens, 0);

-        std::vector<float> query_emb(n_embd_out, 0);
-        batch_process(ctx, query_batch, query_emb.data(), 1, n_embd_out);
+        std::vector<float> query_emb(n_embd, 0);
+        batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);

        common_batch_clear(query_batch);

@@ -275,7 +275,7 @@ int main(int argc, char ** argv) {
        {
            std::vector<std::pair<int, float>> similarities;
            for (int i = 0; i < n_chunks; i++) {
-                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd_out);
+                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
                similarities.push_back(std::make_pair(i, sim));
            }

--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh

 #for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference

 #for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF

 #build example/main
 #cmake --build . --config Release --target main
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -22,9 +22,9 @@ if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
    echo "use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none

 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
 fi
--- a/examples/sycl/run-llama3.sh
+++ b/examples/sycl/run-llama3.sh
@@ -24,8 +24,8 @@ export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
    echo "Using $GGML_SYCL_DEVICE as the main GPU"
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
 fi
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR

 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON

 ::  for FP32
-cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR

 ::  build all binary
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1

-.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
--- a/examples/sycl/win-run-llama3.bat
+++ b/examples/sycl/win-run-llama3.bat
@@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1

-.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99