sync from b7516

2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2,11 +2,10 @@

 #include "chat.h"
 #include "common.h"
-#include "download.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "preset.h"
+#include "download.h"

 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -48,8 +47,6 @@

 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

-extern const char * LICENSES[];
-
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;

@@ -271,55 +268,6 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
    }
 }

-static std::string clean_file_name(const std::string & fname) {
-    std::string clean_fname = fname;
-    string_replace_all(clean_fname, "\\", "_");
-    string_replace_all(clean_fname, "/", "_");
-    return clean_fname;
-}
-
-static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
-    GGML_ASSERT(!params.model.hf_repo.empty());
-
-    // the returned hf_repo is without tag
-    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
-
-    // "latest" tag (default if not specified) is translated to "default" preset
-    if (hf_tag == "latest") {
-        hf_tag = "default";
-    }
-
-    const bool offline = params.offline;
-    std::string model_endpoint = get_model_endpoint();
-    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
-
-    // prepare local path for caching
-    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
-    auto preset_path = fs_get_cache_file(preset_fname);
-    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
-    const bool has_preset = status >= 200 && status < 400;
-
-    // remote preset is optional, so we don't error out if not found
-    if (has_preset) {
-        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
-        common_preset_context ctx(ex, /* only_remote_allowed */ true);
-        common_preset global;
-        auto remote_presets = ctx.load_from_ini(preset_path, global);
-        remote_presets = ctx.cascade(global, remote_presets);
-        if (remote_presets.find(hf_tag) != remote_presets.end()) {
-            common_preset preset = remote_presets.at(hf_tag);
-            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
-            preset.apply_to_params(params);
-        } else {
-            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
-        }
-    } else {
-        LOG_INF("%s", "no remote preset found, skipping\n");
-    }
-
-    return has_preset;
-}
-
 struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
@@ -341,7 +289,7 @@ static handle_model_result common_params_handle_model(
                if (model.path.empty()) {
                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
-                        exit(1); // error message already printed
+                        exit(1); // built without CURL, error message already printed
                    }
                    model.name    = model.hf_repo;      // repo name with tag
                    model.hf_repo = auto_detected.repo; // repo name without tag
@@ -361,7 +309,9 @@ static handle_model_result common_params_handle_model(
            // make sure model path is present (for caching purposes)
            if (model.path.empty()) {
                // this is to avoid different repo having same file name, or same file name in different subdirs
-                std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
+                std::string filename = model.hf_repo + "_" + model.hf_file;
+                // to make sure we don't have any slashes in the filename
+                string_replace_all(filename, "/", "_");
                model.path = fs_get_cache_file(filename);
            }

@@ -475,87 +425,61 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        }
    };

-    auto parse_cli_args = [&]() {
-        std::set<std::string> seen_args;
+    std::set<std::string> seen_args;

-        for (int i = 1; i < argc; i++) {
-            const std::string arg_prefix = "--";
+    for (int i = 1; i < argc; i++) {
+        const std::string arg_prefix = "--";

-            std::string arg = argv[i];
-            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-                std::replace(arg.begin(), arg.end(), '_', '-');
-            }
-            if (arg_to_options.find(arg) == arg_to_options.end()) {
-                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-            }
-            if (!seen_args.insert(arg).second) {
-                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
-            }
-            auto & tmp = arg_to_options[arg];
-            auto opt = *tmp.first;
-            bool is_positive = tmp.second;
-            if (opt.has_value_from_env()) {
-                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
-            }
-            try {
-                if (opt.handler_void) {
-                    opt.handler_void(params);
-                    continue;
-                }
-                if (opt.handler_bool) {
-                    opt.handler_bool(params, is_positive);
-                    continue;
-                }
-
-                // arg with single value
-                check_arg(i);
-                std::string val = argv[++i];
-                if (opt.handler_int) {
-                    opt.handler_int(params, std::stoi(val));
-                    continue;
-                }
-                if (opt.handler_string) {
-                    opt.handler_string(params, val);
-                    continue;
-                }
-
-                // arg with 2 values
-                check_arg(i);
-                std::string val2 = argv[++i];
-                if (opt.handler_str_str) {
-                    opt.handler_str_str(params, val, val2);
-                    continue;
-                }
-            } catch (std::exception & e) {
-                throw std::invalid_argument(string_format(
-                    "error while handling argument \"%s\": %s\n\n"
-                    "usage:\n%s\n\nto show complete usage, run with -h",
-                    arg.c_str(), e.what(), opt.to_string().c_str()));
-            }
+        std::string arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
        }
-    };
-
-    // parse the first time to get -hf option (used for remote preset)
-    parse_cli_args();
-
-    // maybe handle remote preset
-    if (!params.model.hf_repo.empty()) {
-        std::string cli_hf_repo = params.model.hf_repo;
-        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
-
-        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
-        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
-        std::string preset_hf_repo = params.model.hf_repo;
-        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
-
-        if (has_preset) {
-            // re-parse CLI args to override preset values
-            parse_cli_args();
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
+        if (!seen_args.insert(arg).second) {
+            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+        }
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
+        if (opt.has_value_from_env()) {
+            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+        }
+        try {
+            if (opt.handler_void) {
+                opt.handler_void(params);
+                continue;
+            }
+            if (opt.handler_bool) {
+                opt.handler_bool(params, is_positive);
+                continue;
+            }

-        // preserve hf_repo from preset if needed
-        if (preset_has_hf_repo) {
-            params.model.hf_repo = preset_hf_repo;
+            // arg with single value
+            check_arg(i);
+            std::string val = argv[++i];
+            if (opt.handler_int) {
+                opt.handler_int(params, std::stoi(val));
+                continue;
+            }
+            if (opt.handler_string) {
+                opt.handler_string(params, val);
+                continue;
+            }
+
+            // arg with 2 values
+            check_arg(i);
+            std::string val2 = argv[++i];
+            if (opt.handler_str_str) {
+                opt.handler_str_str(params, val, val2);
+                continue;
+            }
+        } catch (std::exception & e) {
+            throw std::invalid_argument(string_format(
+                "error while handling argument \"%s\": %s\n\n"
+                "usage:\n%s\n\nto show complete usage, run with -h",
+                arg.c_str(), e.what(), opt.to_string().c_str()));
        }
    }

@@ -755,6 +679,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
        "llama-quantize",
        "llama-qwen2vl-cli",
        "llama-retrieval",
+        "llama-run",
        "llama-save-load-state",
        "llama-server",
        "llama-simple",
@@ -929,54 +854,6 @@ bool common_arg_utils::is_autoy(const std::string & value) {
    return value == "auto" || value == "-1";
 }

-// Simple CSV parser that handles quoted fields and escaped quotes
-// example:
-//    input:  value1,"value, with, commas","value with ""escaped"" quotes",value4
-//    output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
-static std::vector<std::string> parse_csv_row(const std::string& input) {
-    std::vector<std::string> fields;
-    std::string field;
-    bool in_quotes = false;
-
-    for (size_t i = 0; i < input.length(); ++i) {
-        char ch = input[i];
-
-        if (ch == '"') {
-            if (!in_quotes) {
-                // start of quoted field (only valid if at beginning of field)
-                if (!field.empty()) {
-                    // quote appeared in middle of unquoted field, treat as literal
-                    field += '"';
-                } else {
-                    in_quotes = true; // start
-                }
-            } else {
-                if (i + 1 < input.length() && input[i + 1] == '"') {
-                    // escaped quote: ""
-                    field += '"';
-                    ++i; // skip the next quote
-                } else {
-                    in_quotes = false; // end
-                }
-            }
-        } else if (ch == ',') {
-            if (in_quotes) {
-                field += ',';
-            } else {
-                fields.push_back(std::move(field));
-                field.clear();
-            }
-        } else {
-            field += ch;
-        }
-    }
-
-    // Add the last field
-    fields.push_back(std::move(field));
-
-    return fields;
-}
-
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    // per-example default params
    // we define here to make sure it's included in llama-gen-docs
@@ -1041,16 +918,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            exit(0);
        }
    ));
-    add_opt(common_arg(
-        {"--license"},
-        "show source code license and dependencies",
-        [](common_params &) {
-            for (int i = 0; LICENSES[i]; ++i) {
-                printf("%s\n", LICENSES[i]);
-            }
-            exit(0);
-        }
-    ));
    add_opt(common_arg(
        {"-cl", "--cache-list"},
        "show list of models in cache",
@@ -1295,7 +1162,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.kv_unified = true;
        }
-    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--context-shift"},
        {"--no-context-shift"},
@@ -1383,7 +1250,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--in-file"}, "FNAME",
        "an input file (use comma-separated values to specify multiple files)",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
                std::ifstream file(item);
                if (!file) {
                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -1530,7 +1397,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, bool value) {
            params.warmup = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
@@ -1729,26 +1596,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_sparam());
-    add_opt(common_arg(
-        {"--adaptive-target"}, "N",
-        string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
-                      "to 1.0; negative = disabled) (default: %.2f)\n"
-                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
-                      (double)params.sampling.adaptive_target),
-        [](common_params & params, const std::string & value) {
-            params.sampling.adaptive_target = std::stof(value);
-        }
-    ).set_sparam());
-    add_opt(common_arg(
-        {"--adaptive-decay"}, "N",
-        string_format("adaptive-p: decay rate for target adaptation over time. lower values "
-                      "are more reactive, higher values are more stable.\n"
-                      "(valid range 0.0 to 0.99) (default: %.2f)",
-                      (double)params.sampling.adaptive_decay),
-        [](common_params & params, const std::string & value) {
-            params.sampling.adaptive_decay = std::stof(value);
-        }
-    ).set_sparam());
    add_opt(common_arg(
        {"--dynatemp-range"}, "N",
        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
@@ -1848,13 +1695,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
        }
    ).set_sparam());
-    add_opt(common_arg(
-        {"-bs", "--backend-sampling"},
-        "enable backend sampling (experimental) (default: disabled)",
-        [](common_params & params) {
-            params.sampling.backend_sampling = true;
-        }
-    ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
    add_opt(common_arg(
        {"--pooling"}, "{none,mean,cls,last,rank}",
        "pooling type for embeddings, use model default if unspecified",
@@ -1866,7 +1706,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
            else { throw std::invalid_argument("invalid value"); }
        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
    add_opt(common_arg(
        {"--attention"}, "{causal,non-causal}",
        "attention type for embeddings, use model default if unspecified",
@@ -2155,7 +1995,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--image", "--audio"}, "FILE",
        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
                params.image.emplace_back(item);
            }
        }
@@ -2177,7 +2017,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
-            "comma separated list of RPC servers (host:port)",
+            "comma separated list of RPC servers",
            [](common_params & params, const std::string & value) {
                add_rpc_devices(value);
                GGML_UNUSED(params);
@@ -2194,22 +2034,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"--mmap"},
        {"--no-mmap"},
-        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.use_mmap = value;
-            if (value) {
-                params.use_direct_io = false;  // disable direct io when mmap is explicitly enabled
-            }
        }
    ).set_env("LLAMA_ARG_MMAP"));
-    add_opt(common_arg(
-        {"-dio", "--direct-io"},
-        {"-ndio", "--no-direct-io"},
-        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.use_direct_io = value;
-        }
-    ).set_env("LLAMA_ARG_DIO"));
    add_opt(common_arg(
        {"--numa"}, "TYPE",
        "attempt optimizations that help on some NUMA systems\n"
@@ -2258,7 +2087,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "override tensor buffer type", [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
        }
-    ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
+    ));
    add_opt(common_arg(
        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
@@ -2308,18 +2137,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
-    GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
-        [](common_params & params, const std::string & value) {
-            if (value == "auto") {
-                params.n_gpu_layers = -1;
-            } else if (value == "all") {
-                params.n_gpu_layers = -2;
-            } else {
-                params.n_gpu_layers = std::stoi(value);
-            }
+        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
+        [](common_params & params, int value) {
+            params.n_gpu_layers = value;
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -2361,7 +2183,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            std::vector<std::string> split_arg{ it, {} };
            if (split_arg.size() >= llama_max_devices()) {
                throw std::invalid_argument(
-                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
                );
            }
            for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2401,28 +2223,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_FIT"));
    add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
-        string_format("target margin per device for --fit, comma-separated list of values, "
-            "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
-        [](common_params & params, const std::string & value) {
-            std::string arg_next = value;
-
-            // split string by , and /
-            const std::regex regex{ R"([,/]+)" };
-            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
-            std::vector<std::string> split_arg{ it, {} };
-            if (split_arg.size() >= llama_max_devices()) {
-                throw std::invalid_argument(
-                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
-                );
-            }
-            if (split_arg.size() == 1) {
-                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
-                return;
-            }
-            for (size_t i = 0; i < split_arg.size(); i++) {
-                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
-            }
+        { "-fitt", "--fit-target" }, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_target = value * size_t(1024*1024);
        }
    ).set_env("LLAMA_ARG_FIT_TARGET"));
    add_opt(common_arg(
@@ -2441,12 +2245,37 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ));
    add_opt(common_arg(
        {"--override-kv"}, "KEY=TYPE:VALUE,...",
-        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
+        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
-                if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
-                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
+            std::vector<std::string> kv_overrides; // items split out of value, one per unescaped comma
+
+            std::string current; // item currently being accumulated
+            bool escaping = false; // true when the previous character was a backslash that has not been consumed yet
+
+            for (const char c : value) {
+                if (escaping) { // escaped character: append it verbatim, even if it is a comma or a backslash
+                    current.push_back(c);
+                    escaping = false;
+                } else if (c == '\\') { // a backslash starts an escape and is not copied itself
+                    escaping = true;
+                } else if (c == ',') { // an unescaped comma terminates the current item
+                    kv_overrides.push_back(current);
+                    current.clear();
+                } else {
+                    current.push_back(c);
+                }
+            }
+
+            if (escaping) { // value ended on a lone backslash: keep it as a literal character
+                current.push_back('\\');
+            }
+
+            kv_overrides.push_back(current); // flush the final item (pushed even when it is empty)
+
+            for (const auto & kv_override : kv_overrides) { // hand each item to string_parse_kv_override; a false return is reported as an error
+                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
                 }
            }
        }
@@ -2463,7 +2292,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--lora"}, "FNAME",
        "path to LoRA adapter (use comma-separated values to load multiple adapters)",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
                params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
            }
        }
@@ -2474,7 +2303,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
        "note: use comma-separated values",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
                auto parts = string_split<std::string>(item, ':');
                if (parts.size() != 2) {
                    throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
@@ -2488,7 +2317,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--control-vector"}, "FNAME",
        "add a control vector\nnote: use comma-separated values to add multiple control vectors",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
                params.control_vectors.push_back({ 1.0f, item, });
            }
        }
@@ -2498,7 +2327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "add a control vector with user defined scaling SCALE\n"
        "note: use comma-separated values (format: FNAME:SCALE,...)",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
                auto parts = string_split<std::string>(item, ':');
                if (parts.size() != 2) {
                    throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
@@ -2596,7 +2425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--context-file"}, "FNAME",
        "file to load context from (use comma-separated values to specify multiple files)",
        [](common_params & params, const std::string & value) {
-            for (const auto & item : parse_csv_row(value)) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
                std::ifstream file(item, std::ios::binary);
                if (!file) {
                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -2743,7 +2572,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.embd_normalize = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
    add_opt(common_arg(
        {"--embd-output-format"}, "FORMAT",
        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2821,7 +2650,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.embedding = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
    add_opt(common_arg(
        {"--rerank", "--reranking"},
        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -2832,13 +2661,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
    add_opt(common_arg(
        {"--api-key"}, "KEY",
-        "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
+        "API key to use for authentication (default: none)",
        [](common_params & params, const std::string & value) {
-            for (const auto & key : parse_csv_row(value)) {
-                if (!key.empty()) {
-                    params.api_keys.push_back(key);
-                }
-            }
+            params.api_keys.push_back(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
    add_opt(common_arg(
@@ -2852,7 +2677,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            std::string key;
            while (std::getline(key_file, key)) {
                if (!key.empty()) {
-                    params.api_keys.push_back(key);
+                        params.api_keys.push_back(key);
                }
            }
            key_file.close();
@@ -2874,7 +2699,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
    add_opt(common_arg(
        {"--chat-template-kwargs"}, "STRING",
-        "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
+        string_format("sets additional params for the json template parser"),
        [](common_params & params, const std::string & value) {
            auto parsed = json::parse(value);
            for (const auto & item : parsed.items()) {
@@ -2897,18 +2722,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_threads_http = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(common_arg(
-        {"--cache-prompt"},
-        {"--no-cache-prompt"},
-        string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.cache_prompt = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
    add_opt(common_arg(
        {"--cache-reuse"}, "N",
        string_format(
-            "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
+            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
        ),
        [](common_params & params, int value) {
@@ -3358,19 +3175,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.devices = parse_device_list(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
-    GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
    add_opt(common_arg(
        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
-            params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
-        [](common_params & params, const std::string & value) {
-            if (value == "auto") {
-                params.speculative.n_gpu_layers = -1;
-            } else if (value == "all") {
-                params.speculative.n_gpu_layers = -2;
-            } else {
-                params.speculative.n_gpu_layers = std::stoi(value);
-            }
+        "number of layers to store in VRAM for the draft model",
+        [](common_params & params, int value) {
+            params.speculative.n_gpu_layers = value;
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3520,27 +3329,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg(
-        {"--save-logits"},
-        string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
-        [](common_params & params) {
-            params.save_logits = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
-    add_opt(common_arg(
-        {"--logits-output-dir"}, "PATH",
-        string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
-        [](common_params & params, const std::string & value) {
-            params.logits_output_dir = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
-    add_opt(common_arg(
-        {"--tensor-filter"}, "REGEX",
-        "filter tensor names for debug output (regex pattern, can be specified multiple times)",
-        [](common_params & params, const std::string & value) {
-            params.tensor_filter.push_back(value);
-        }
-    ).set_examples({LLAMA_EXAMPLE_DEBUG}));

    // presets
    add_opt(common_arg(
@@ -3730,15 +3518,15 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
        [](common_params &, const std::string &) { /* unused */ }
    ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());

-    args.push_back(common_arg(
-        {"stop-timeout"}, "SECONDS",
-        "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
-        [](common_params &, int) { /* unused */ }
-    ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
-
    // args.push_back(common_arg(
    //     {"pin"},
    //     "in server router mode, do not unload this model if models_max is exceeded",
    //     [](common_params &) { /* unused */ }
    // ).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"unload-idle-seconds"}, "SECONDS",
+    //     "in server router mode, unload models idle for more than this many seconds",
+    //     [](common_params &, int) { /* unused */ }
+    // ).set_preset_only());
 }