#include "llama-impl.h"

#include "llama-chat.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-model-loader.h"
#include "llama-model.h"

#include "ggml.h"
#include "ggml-backend.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

//
// interface implementation
//

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
    };

    return result;
}
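
// Illustrative caller-side sketch (not part of this file upstream): the
// defaults above can be adjusted before building a sampler chain, e.g. to
// re-enable performance measurements:
//
//     struct llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
//     sparams.no_perf = false; // collect sampler timing stats
//     struct llama_sampler * chain = llama_sampler_chain_init(sparams);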

size_t llama_max_devices(void) {
    return 16;
}

bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
}

bool llama_supports_gpu_offload(void) {
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
           llama_supports_rpc();
}

bool llama_supports_rpc(void) {
    return ggml_backend_reg_by_name("RPC") != nullptr;
}
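
// Illustrative caller-side sketch (assumed usage, not upstream code): these
// capability queries let an application gate optional features at runtime
// (want_mlock is a hypothetical application flag):
//
//     struct llama_model_params params = llama_model_default_params();
//     if (llama_supports_gpu_offload()) {
//         params.n_gpu_layers = 99; // offload as many layers as possible
//     }
//     params.use_mmap  = llama_supports_mmap();
//     params.use_mlock = llama_supports_mlock() && want_mlock;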

void llama_backend_init(void) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
}

void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        GGML_ASSERT(dev && "CPU backend is not loaded");
        auto * reg = ggml_backend_dev_backend_reg(dev);
        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
        numa_init_fn(numa);
    }
}
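
// Illustrative startup sequence (assumed usage, not upstream code): the
// backend is initialized once per process, before any model is loaded:
//
//     llama_backend_init();
//     llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // optional, NUMA hosts only
//     // ... load models, create contexts ...
//     llama_backend_free();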

void llama_backend_free(void) {
    ggml_quantize_free();
}

int64_t llama_time_us(void) {
    return ggml_time_us();
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = 0;
    time_meas tm(model.t_load_us);

    model.t_start_us = tm.t_start_us;

    try {
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;

        try {
            model.load_arch(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
        }
        try {
            model.load_hparams(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
        try {
            model.load_vocab(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
        }

        model.load_stats(ml);
        model.print_info();

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return 0;
        }

        if (!model.load_tensors(ml)) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
    }

    return 0;
}
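
// Illustrative sketch (assumed usage, not upstream code): the vocab_only path
// above is reachable through the public API, which is useful for tokenization
// without paying for tensor data ("model.gguf" is a placeholder path):
//
//     struct llama_model_params mparams = llama_model_default_params();
//     mparams.vocab_only = true; // parse metadata + vocab, skip tensors
//     struct llama_model * vocab_model = llama_model_load_from_file("model.gguf", mparams);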
|
|
|
|
|
|
2025-01-16 13:54:08 +01:00
|
|
|
static struct llama_model * llama_model_load_from_file_impl(
|
|
|
|
|
const std::string & path_model,
|
|
|
|
|
std::vector<std::string> & splits,
|
2025-01-06 10:55:18 +02:00
|
|
|
struct llama_model_params params) {
|
2025-01-03 10:18:53 +02:00
|
|
|
ggml_time_init();
|
2024-07-28 00:42:05 -04:00
|
|
|
|
2025-01-03 10:18:53 +02:00
|
|
|
unsigned cur_percentage = 0;
|
|
|
|
|
if (params.progress_callback == NULL) {
|
|
|
|
|
params.progress_callback_user_data = &cur_percentage;
|
|
|
|
|
params.progress_callback = [](float progress, void * ctx) {
|
|
|
|
|
unsigned * cur_percentage_p = (unsigned *) ctx;
|
|
|
|
|
unsigned percentage = (unsigned) (100 * progress);
|
|
|
|
|
while (percentage > *cur_percentage_p) {
|
|
|
|
|
*cur_percentage_p = percentage;
|
|
|
|
|
LLAMA_LOG_CONT(".");
|
|
|
|
|
if (percentage >= 100) {
|
|
|
|
|
LLAMA_LOG_CONT("\n");
|
2024-07-28 00:42:05 -04:00
|
|
|
}
|
|
|
|
|
}
|
2025-01-03 10:18:53 +02:00
|
|
|
return true;
|
|
|
|
|
};
|
|
|
|
|
}
|
2024-07-28 00:42:05 -04:00
|
|
|
|
2025-02-07 15:48:47 +02:00
|
|
|
llama_model * model = new llama_model(params);
|
|
|
|
|
|
2025-01-03 10:18:53 +02:00
|
|
|
// create list of devices to use with this model
|
|
|
|
|
if (params.devices) {
|
|
|
|
|
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
|
|
|
|
|
model->devices.push_back(*dev);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2025-01-26 23:20:34 +08:00
|
|
|
std::vector<ggml_backend_dev_t> rpc_servers;
|
2025-01-03 10:18:53 +02:00
|
|
|
// use all available devices
|
|
|
|
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
|
|
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
|
|
|
switch (ggml_backend_dev_type(dev)) {
|
|
|
|
|
case GGML_BACKEND_DEVICE_TYPE_CPU:
|
|
|
|
|
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
|
|
|
|
// skip CPU backends since they are handled separately
|
|
|
|
|
break;
|
2023-10-03 21:04:01 +03:00
|
|
|
|
2025-01-03 10:18:53 +02:00
|
|
|
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
2025-01-26 23:20:34 +08:00
|
|
|
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
|
|
|
|
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
|
|
|
|
rpc_servers.push_back(dev);
|
|
|
|
|
} else {
|
|
|
|
|
model->devices.push_back(dev);
|
|
|
|
|
}
|
2025-01-03 10:18:53 +02:00
|
|
|
break;
|
2024-07-28 00:42:05 -04:00
|
|
|
}
|
|
|
|
|
}
|
2025-01-26 23:20:34 +08:00
|
|
|
// add RPC servers at the front of the list
|
|
|
|
|
if (!rpc_servers.empty()) {
|
|
|
|
|
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
|
|
|
|
|
}
|
2024-07-28 00:42:05 -04:00
|
|
|
}
|
2023-10-03 21:04:01 +03:00
|
|
|
|
2025-01-03 10:18:53 +02:00
|
|
|
// if using single GPU mode, remove all except the main GPU
|
|
|
|
|
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
|
|
|
|
|
if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
|
|
|
|
|
LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
|
2025-01-06 10:55:18 +02:00
|
|
|
llama_model_free(model);
|
2025-01-03 10:18:53 +02:00
|
|
|
return nullptr;
|
|
|
|
|
}
|
|
|
|
|
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
|
|
|
|
|
model->devices.clear();
|
|
|
|
|
model->devices.push_back(main_gpu);
|
2024-07-28 00:42:05 -04:00
|
|
|
}
|
|
|
|
|
|
2025-01-03 10:18:53 +02:00
|
|
|
for (auto * dev : model->devices) {
|
|
|
|
|
size_t free, total; // NOLINT
|
|
|
|
|
ggml_backend_dev_memory(dev, &free, &total);
|
|
|
|
|
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
|
2024-08-08 23:54:00 -04:00
|
|
|
}
|
|
|
|
|
|
2025-01-16 13:54:08 +01:00
|
|
|
const int status = llama_model_load(path_model, splits, *model, params);
|
2025-01-03 10:18:53 +02:00
|
|
|
GGML_ASSERT(status <= 0);
|
|
|
|
|
if (status < 0) {
|
|
|
|
|
if (status == -1) {
|
|
|
|
|
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
|
|
|
|
} else if (status == -2) {
|
|
|
|
|
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-06 10:55:18 +02:00
|
|
|
llama_model_free(model);
|
2025-01-03 10:18:53 +02:00
|
|
|
return nullptr;
|
2024-07-28 00:42:05 -04:00
|
|
|
}
|
|
|
|
|
|
2025-01-03 10:18:53 +02:00
|
|
|
return model;
|
|
|
|
|
}
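
// Illustrative custom progress callback (assumed usage, not upstream code):
// returning false from the callback cancels the load, which surfaces as the
// -2 status handled above:
//
//     static bool my_progress(float progress, void * user_data) {
//         (void) user_data;
//         fprintf(stderr, "\rloading: %3d%%", (int) (100 * progress));
//         return true; // return false to cancel the load
//     }
//
//     struct llama_model_params mparams = llama_model_default_params();
//     mparams.progress_callback = my_progress;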

// deprecated
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    return llama_model_load_from_file(path_model, params);
}

struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(path_model, splits, params);
}

struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    std::vector<std::string> splits;
    if (n_paths == 0) {
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(splits.front(), splits, params);
}
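
// Illustrative caller-side sketch (assumed usage, not upstream code): loading
// a model that was sharded into explicit split files (paths are placeholders):
//
//     const char * paths[] = {
//         "model-00001-of-00002.gguf",
//         "model-00002-of-00002.gguf",
//     };
//     struct llama_model * model =
//         llama_model_load_from_splits(paths, 2, llama_model_default_params());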

//
// chat templates
//

int32_t llama_chat_apply_template(
        const char * tmpl,
        const struct llama_chat_message * chat,
        size_t n_msg,
        bool add_ass,
        char * buf,
        int32_t length) {
    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);

    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }

    std::string formatted_chat;
    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
        return -1;
    }
    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}
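
// Illustrative two-pass call pattern (assumed usage, not upstream code): the
// return value is the length of the formatted chat, which can exceed the
// supplied buffer; note that strncpy() above does not null-terminate on
// truncation, so resize and retry when res > length:
//
//     std::vector<char> buf(1024);
//     int32_t res = llama_chat_apply_template(tmpl, chat, n_msg, true, buf.data(), buf.size());
//     if (res > (int32_t) buf.size()) {
//         buf.resize(res);
//         res = llama_chat_apply_template(tmpl, chat, n_msg, true, buf.data(), buf.size());
//     }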

//
// model split
//

int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
        return strlen(split_path);
    }
    return 0;
}
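
// Illustrative example (assumed usage, not upstream code): split indices are
// zero-based on input and rendered one-based, zero-padded to five digits:
//
//     char split_path[512];
//     llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 1, 4);
//     // -> "/models/ggml-model-q4_0-00002-of-00004.gguf"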

int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);

    // check if split_path ends with postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }

    return 0;
}
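
// Illustrative example (assumed usage, not upstream code): recovering the
// prefix from a split file name; returns 0 when the path does not end in the
// expected "-%05d-of-%05d.gguf" postfix:
//
//     char prefix[512];
//     int n = llama_split_prefix(prefix, sizeof(prefix),
//                                "/models/ggml-model-q4_0-00002-of-00004.gguf", 1, 4);
//     // n == strlen("/models/ggml-model-q4_0"), prefix == "/models/ggml-model-q4_0"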

const char * llama_print_system_info(void) {
    static std::string s;
    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.

    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        auto * reg = ggml_backend_reg_get(i);
        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (get_features_fn) {
            ggml_backend_feature * features = get_features_fn(reg);
            s += ggml_backend_reg_name(reg);
            s += " : ";
            for (; features->name; features++) {
                s += features->name;
                s += " = ";
                s += features->value;
                s += " | ";
            }
        }
    }

    return s.c_str();
}