Sync from upstream llama.cpp repository
ggml/include/ggml-alloc.h (new file, 85 lines)
@@ -0,0 +1,85 @@
#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;

// Tensor allocator
struct ggml_tallocr {
    ggml_backend_buffer_t buffer;
    void * base;
    size_t alignment;
    size_t offset;
};

GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

// Graph allocator
/*
  Example usage:
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
    ggml_gallocr_reserve(galloc, build_graph(max_batch));

    // allocate the graph
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);

    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    // evaluate the graph
    ggml_backend_graph_compute(backend, graph);
*/

// special tensor flags for use with the graph allocator:
//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
//   ggml_set_output(): output tensors are never freed and never overwritten

typedef struct ggml_gallocr * ggml_gallocr_t;

GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);

// pre-allocate buffers from a measure graph - does not allocate or modify the graph
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API void ggml_gallocr_reserve_n_size(
    ggml_gallocr_t galloc,
    struct ggml_cgraph * graph,
    const int * node_buffer_ids,
    const int * leaf_buffer_ids,
    size_t * sizes);
GGML_API bool ggml_gallocr_reserve_n(
    ggml_gallocr_t galloc,
    struct ggml_cgraph * graph,
    const int * node_buffer_ids,
    const int * leaf_buffer_ids);

// automatic reallocation if the topology changes when using a single buffer
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

#ifdef __cplusplus
}
#endif
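As an illustration of the Utils section above, a minimal sketch (not part of the synced header) that allocates every tensor of a no_alloc context in a single CPU backend buffer; it assumes a default CPU build of ggml:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    void alloc_ctx_example() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8, // room for tensor metadata only
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true, // tensor data will live in a backend buffer
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        (void) a;

        // one backend buffer holding the data of every tensor created in ctx
        ggml_backend_buffer_t buf =
            ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
    }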
ggml/include/ggml-backend.h (new file, 373 lines)
@@ -0,0 +1,373 @@
#pragma once

#include "ggml.h"
#include "ggml-alloc.h"

#ifdef GGML_BACKEND_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef GGML_BACKEND_BUILD
#            define GGML_BACKEND_API __declspec(dllexport) extern
#        else
#            define GGML_BACKEND_API __declspec(dllimport) extern
#        endif
#    else
#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
#    endif
#else
#    define GGML_BACKEND_API extern
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;


//
// Backend buffer type
//

GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);

//
// Backend buffer
//

enum ggml_backend_buffer_usage {
    GGML_BACKEND_BUFFER_USAGE_ANY = 0,
    GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
    GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};

GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
GGML_API enum ggml_status               ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

//
// Backend (stream)
//

GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void         ggml_backend_free(ggml_backend_t backend);

GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);

GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

// "offset" refers to the offset in tensor->data for setting/getting data
GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op  (ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op   (ggml_backend_t backend, const struct ggml_tensor * op);

// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

//
// Events
//

GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);

//
// Backend device
//

enum ggml_backend_dev_type {
    // CPU device using system memory
    GGML_BACKEND_DEVICE_TYPE_CPU,
    // GPU device using dedicated memory
    GGML_BACKEND_DEVICE_TYPE_GPU,
    // integrated GPU device using host memory
    GGML_BACKEND_DEVICE_TYPE_IGPU,
    // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
    GGML_BACKEND_DEVICE_TYPE_ACCEL
};

// functionality supported by the device
struct ggml_backend_dev_caps {
    // asynchronous operations
    bool async;
    // pinned host buffer
    bool host_buffer;
    // creating buffers from host ptr
    bool buffer_from_host_ptr;
    // event synchronization
    bool events;
};

// all the device properties
struct ggml_backend_dev_props {
    // device name
    const char * name;
    // device description
    const char * description;
    // device free memory in bytes
    size_t memory_free;
    // device total memory in bytes
    size_t memory_total;
    // device type
    enum ggml_backend_dev_type type;
    // device id
    // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
    // if the id is unknown, this should be NULL
    const char * device_id;
    // device capabilities
    struct ggml_backend_dev_caps caps;
};

GGML_API const char *               ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char *               ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void                       ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void                       ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t         ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t             ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t      ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);

GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);

//
// Backend (reg)
//

GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);

// Common functions that may be obtained using ggml_backend_reg_get_proc_address

// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
    const char * name;
    const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);

//
// Backend registry
//

GGML_API void ggml_backend_register(ggml_backend_reg_t reg);

GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);

// Backend (reg) enumeration
GGML_API size_t             ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);

// Device enumeration
GGML_API size_t             ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);

// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);

// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);

//
// Backend scheduler
//

// The backend scheduler allows multiple backend devices to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
// - the location of the pre-allocated tensors (e.g. the weights)
/*
  Example usage:

    // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
    // preferably to run on the same backend as the buffer
    ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);

    // initialize buffers from a max size graph (optional)
    reserve_graph = build_graph(sched, max_batch_size);

    // manually assign nodes to a backend (optional, should not be needed in most cases)
    struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
    ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);

    ggml_backend_sched_reserve(sched, reserve_graph);

    // compute
    graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
    for (int i = 0; i < 10; ++i) {
        ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
    }

    // if there are graph inputs:
    graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
    ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
    ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
    ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
    ggml_backend_sched_graph_compute(sched, graph); // execute the graph

    // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
    // allocate them statically via ggml_backend_alloc_ctx_tensors
*/

typedef struct ggml_backend_sched * ggml_backend_sched_t;

// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
// when ask == false, the scheduler is passing the node tensor to the user for observation
// if the user returns false, the scheduler will cancel the graph compute
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
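For reference, a small sketch (not part of the synced header) of an observer matching this callback contract; the tensor name "logits" is only a placeholder:

    #include <cstring>
    #include "ggml-backend.h"

    static bool observe_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        if (ask) {
            // tell the scheduler whether we want to see this node after it runs
            return strcmp(t->name, "logits") == 0;
        }
        // ask == false: t now contains computed data and can be inspected here
        return true; // returning false would cancel the graph compute
    }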
// Initialize a backend scheduler; backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

GGML_API int            ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);

// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t                     ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

GGML_API void           ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

// Split graph without allocating it
GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

// Allocate and compute graph on the backend scheduler
GGML_API bool             ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void             ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

//
// Utils
//

struct ggml_backend_graph_copy {
    ggml_backend_buffer_t buffer;
    struct ggml_context * ctx_allocated;
    struct ggml_context * ctx_unallocated;
    struct ggml_cgraph * graph;
};

// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);

// Tensor initialization
GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);

// CPU buffer types are always available
GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

#ifdef __cplusplus
}
#endif
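A short sketch (not part of the synced header) of the registry/device enumeration API above; every call used here is declared in ggml-backend.h:

    #include <cstdio>
    #include "ggml-backend.h"

    void list_devices() {
        ggml_backend_load_all(); // pick up dynamically built backends, if any

        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            struct ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            printf("%s: %s (%zu MiB free)\n",
                props.name, props.description, props.memory_free / (1024 * 1024));
        }

        ggml_backend_t backend = ggml_backend_init_best(); // GPU if present, otherwise CPU
        if (backend != nullptr) {
            ggml_backend_free(backend);
        }
    }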
ggml/include/ggml-blas.h (new file, 25 lines)
@@ -0,0 +1,25 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"


#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);

GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);

// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);


#ifdef __cplusplus
}
#endif
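Typical initialization, as a sketch; it assumes ggml was built with the BLAS backend enabled:

    ggml_backend_t blas = ggml_backend_blas_init();
    if (blas != nullptr) {
        ggml_backend_blas_set_n_threads(blas, 8); // also sets BLAS threads for OpenBLAS/BLIS
    }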
ggml/include/ggml-cann.h (new file, 123 lines)
@@ -0,0 +1,123 @@
/*
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#include "ggml-backend.h"
#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief Maximum number of CANN devices supported.
 */
#define GGML_CANN_MAX_DEVICES 16

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);

/**
 * @brief Initializes the CANN backend for a specified device.
 *
 * This function initializes the CANN backend for the given device.
 * It verifies the device index, allocates a context, and creates a backend
 * instance.
 *
 * @param device The index of the device to initialize.
 * @return A pointer to the initialized backend instance, or nullptr on failure.
 */
GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);

/**
 * @brief Checks if a given backend is a CANN backend.
 *
 * This function verifies if the provided backend is a CANN backend by comparing
 * its GUID with the CANN backend's GUID.
 *
 * @param backend The backend instance to check.
 * @return True if the backend is a CANN backend, false otherwise.
 */
GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);

/**
 * @brief Retrieves the CANN buffer type for a specified device.
 *
 * This function initializes and returns the buffer type interface associated
 * with the given device. It ensures thread-safe access using a mutex.
 *
 * @param device The device index for which to retrieve the buffer type.
 * @return A pointer to the buffer type interface for the specified device, or
 *         nullptr if the device index is out of range.
 */
GGML_BACKEND_API ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device);

/**
 * @brief Retrieves the number of CANN devices available.
 *
 * This function returns the number of CANN devices available based on
 * information obtained from `ggml_cann_info()`.
 *
 * @return The number of CANN devices available.
 */
GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);

/**
 * @brief Pinned host buffer for use with the CPU backend, for faster copies between CPU and NPU.
 *
 * @return A pointer to the host buffer type interface.
 */
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);

/**
 * @brief Retrieves the description of a specific CANN device.
 *
 * This function sets the specified device, retrieves the SoC name,
 * and writes it into the provided description buffer.
 *
 * @param device The device index to retrieve the description for.
 * @param description Pointer to a buffer where the description will be written.
 * @param description_size Size of the description buffer.
 */
GGML_BACKEND_API void ggml_backend_cann_get_device_description(
    int32_t device, char* description, size_t description_size);

/**
 * @brief Retrieves the memory information of a specific CANN device.
 *
 * This function sets the specified device, retrieves the free and total
 * memory information of the specified type (ACL_HBM_MEM), and stores them
 * in the provided pointers.
 *
 * @param device The device index to retrieve memory information for.
 * @param free Pointer to a variable where the free memory size will be stored.
 * @param total Pointer to a variable where the total memory size will be
 *              stored.
 */
GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
                                                          size_t* free,
                                                          size_t* total);

#ifdef __cplusplus
}
#endif
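A sketch of enumerating CANN devices with the functions above; it assumes a CANN-enabled build:

    #include <cstdio>
    #include "ggml-cann.h"

    void list_cann_devices() {
        for (int32_t dev = 0; dev < ggml_backend_cann_get_device_count(); ++dev) {
            char   desc[128];
            size_t free_mem  = 0;
            size_t total_mem = 0;
            ggml_backend_cann_get_device_description(dev, desc, sizeof(desc));
            ggml_backend_cann_get_device_memory(dev, &free_mem, &total_mem);
            printf("CANN device %d: %s (%zu of %zu bytes free)\n", dev, desc, free_mem, total_mem);
        }
    }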
ggml/include/ggml-cpp.h (new file, 39 lines)
@@ -0,0 +1,39 @@
#pragma once

#ifndef __cplusplus
#error "This header is for C++ only"
#endif

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#include <memory>

// Smart pointers for ggml types

// ggml

struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };

typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;

// ggml-alloc

struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };

typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;

// ggml-backend

struct ggml_backend_deleter        { void operator()(ggml_backend_t backend)       { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t event)   { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t sched)   { ggml_backend_sched_free(sched); } };

typedef std::unique_ptr<ggml_backend,        ggml_backend_deleter>        ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event,  ggml_backend_event_deleter>  ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched,  ggml_backend_sched_deleter>  ggml_backend_sched_ptr;
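A minimal sketch of the RAII wrappers above, assuming a CPU backend is available:

    #include "ggml-cpp.h"
    #include "ggml-cpu.h"

    void raii_example() {
        ggml_backend_ptr backend { ggml_backend_cpu_init() };
        ggml_gallocr_ptr galloc  { ggml_gallocr_new(ggml_backend_cpu_buffer_type()) };
        // both objects are released automatically when the unique_ptrs go out of scope
    }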
ggml/include/ggml-cpu.h (new file, 146 lines)
@@ -0,0 +1,146 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggml-org/ggml/issues/287
struct ggml_cplan {
    size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
    uint8_t * work_data; // work buffer, to be allocated by the caller before calling `ggml_graph_compute()`

    int n_threads;
    struct ggml_threadpool * threadpool;

    // abort ggml_graph_compute when true
    ggml_abort_callback abort_callback;
    void *              abort_callback_data;
};

// numa strategies
enum ggml_numa_strategy {
    GGML_NUMA_STRATEGY_DISABLED   = 0,
    GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
    GGML_NUMA_STRATEGY_ISOLATE    = 2,
    GGML_NUMA_STRATEGY_NUMACTL    = 3,
    GGML_NUMA_STRATEGY_MIRROR     = 4,
    GGML_NUMA_STRATEGY_COUNT
};

GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);

GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void  ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);

GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new          (struct ggml_threadpool_params * params);
GGML_BACKEND_API void                     ggml_threadpool_free         (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int                      ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_BACKEND_API void                     ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void                     ggml_threadpool_resume       (struct ggml_threadpool * threadpool);

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, the caller must allocate memory for plan.work_data
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
    const struct ggml_cgraph * cgraph,
    int n_threads, /* = GGML_DEFAULT_N_THREADS */
    struct ggml_threadpool * threadpool /* = NULL */ );
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

//
// system info
//

// x86
GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
GGML_BACKEND_API int ggml_cpu_has_avx        (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
GGML_BACKEND_API int ggml_cpu_has_bmi2       (void);
GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
GGML_BACKEND_API int ggml_cpu_has_fma        (void);
GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon       (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve        (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void); // sve vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_sme        (void);
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void); // risc-v vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);

// Internal types and functions exposed for tests and benchmarks

typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                const void * GGML_RESTRICT y, size_t by, int nrc);

struct ggml_type_traits_cpu {
    ggml_from_float_t from_float;
    ggml_vec_dot_t    vec_dot;
    enum ggml_type    vec_dot_type;
    int64_t           nrows; // number of rows to process simultaneously
};

GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);

GGML_BACKEND_API void ggml_cpu_init(void);

//
// CPU backend
//

GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);

#ifdef __cplusplus
}
#endif
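A minimal sketch of the two-step plan/compute flow described above, assuming `cgraph` was built beforehand:

    #include <vector>
    #include "ggml-cpu.h"

    enum ggml_status compute_on_cpu(struct ggml_cgraph * cgraph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(cgraph, n_threads, /*threadpool=*/nullptr);

        std::vector<uint8_t> work(plan.work_size); // the work buffer is the caller's responsibility
        plan.work_data = work.data();

        return ggml_graph_compute(cgraph, &plan);
    }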
ggml/include/ggml-cuda.h (new file, 47 lines)
@@ -0,0 +1,47 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#ifdef GGML_USE_HIP
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
#define GGML_CUDA_NAME "MUSA"
#define GGML_CUBLAS_NAME "muBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#define GGML_CUDA_MAX_DEVICES 16

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);

GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);

#ifdef __cplusplus
}
#endif
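A sketch of host-buffer registration with the API above; it assumes a CUDA-enabled build:

    #include <vector>
    #include "ggml-cuda.h"

    void pinned_copy_example() {
        std::vector<float> host(1 << 20);

        // pin the buffer so host<->device copies can use DMA;
        // returns false if pinning fails (copies still work, just unpinned)
        if (ggml_backend_cuda_register_host_buffer(host.data(), host.size() * sizeof(float))) {
            // ... upload/download tensor data through this buffer ...
            ggml_backend_cuda_unregister_host_buffer(host.data());
        }
    }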
ggml/include/ggml-hexagon.h (new file, 19 lines)
@@ -0,0 +1,19 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);

GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);

#ifdef __cplusplus
}
#endif
ggml/include/ggml-metal.h (new file, 61 lines)
@@ -0,0 +1,61 @@
// Note: this description is outdated
//
// An interface for computing a ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
//
// How it works:
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stddef.h>
#include <stdbool.h>

struct ggml_tensor;
struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

//
// backend API
// user-code should use only these functions
//

// TODO: remove in the future
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);

GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);

#ifdef __cplusplus
}
#endif
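A sketch of the family check above; it assumes the integer selects the Apple GPU family number from the Metal feature-set tables (e.g. 7 for Apple7), which is how llama.cpp uses this call:

    #include "ggml-metal.h"

    void metal_family_check() {
        ggml_backend_t metal = ggml_backend_metal_init();
        if (metal != nullptr && ggml_backend_metal_supports_family(metal, 7)) {
            // the device supports the Apple7 feature set
        }
        ggml_backend_free(metal); // safe on nullptr
    }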
ggml/include/ggml-opencl.h (new file, 26 lines)
@@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);

#ifdef __cplusplus
}
#endif

#endif // GGML_OPENCL_H
ggml/include/ggml-opt.h (new file, 256 lines)
@@ -0,0 +1,256 @@
// This file contains functionality for training models using GGML.
// It is not strictly needed vs. just vanilla GGML, but it provides a more high-level interface for common needs such as datasets.
// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code.
//
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_opt_dataset;
struct ggml_opt_context;
struct ggml_opt_result;

typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
typedef struct ggml_opt_context * ggml_opt_context_t;
typedef struct ggml_opt_result  * ggml_opt_result_t;

// ====== Loss ======

// built-in loss types, i.e. the built-in quantities minimized by the optimizer
// custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
enum ggml_opt_loss_type {
    GGML_OPT_LOSS_TYPE_MEAN,
    GGML_OPT_LOSS_TYPE_SUM,
    GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
    GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
};

// ====== Dataset ======

GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
    enum ggml_type type_data,    // the type for the internal data tensor
    enum ggml_type type_label,   // the type for the internal labels tensor
    int64_t        ne_datapoint, // number of elements per datapoint
    int64_t        ne_label,     // number of elements per label
    int64_t        ndata,        // total number of datapoints/labels
    int64_t        ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);

// get underlying tensors that store the data
GGML_API int64_t              ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label, ndata]

// shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);

// get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
GGML_API void ggml_opt_dataset_get_batch(
    ggml_opt_dataset_t   dataset,
    struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
    struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
    int64_t              ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(
    ggml_opt_dataset_t dataset,
    void             * data_batch,
    size_t             nb_data_batch,
    void             * labels_batch,
    int64_t            ibatch);
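A sketch of creating and filling a dataset with the functions above; the sizes are arbitrary example values:

    #include "ggml-opt.h"

    void dataset_example() {
        // 100 datapoints of 4 floats each, 1 label element each, shard size 1
        ggml_opt_dataset_t dataset = ggml_opt_dataset_init(
            GGML_TYPE_F32, GGML_TYPE_F32,
            /*ne_datapoint=*/4, /*ne_label=*/1, /*ndata=*/100, /*ndata_shard=*/1);

        // fill the underlying data tensor, shape [ne_datapoint, ndata]
        float * data = (float *) ggml_opt_dataset_data(dataset)->data;
        for (int64_t i = 0; i < 100 * 4; ++i) {
            data[i] = 0.0f; // real training data goes here
        }

        ggml_opt_dataset_free(dataset);
    }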
// ====== Model / Context ======

enum ggml_opt_build_type {
    GGML_OPT_BUILD_TYPE_FORWARD = 10,
    GGML_OPT_BUILD_TYPE_GRAD    = 20,
    GGML_OPT_BUILD_TYPE_OPT     = 30,
};

enum ggml_opt_optimizer_type {
    GGML_OPT_OPTIMIZER_TYPE_ADAMW,
    GGML_OPT_OPTIMIZER_TYPE_SGD,

    GGML_OPT_OPTIMIZER_TYPE_COUNT
};

// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
struct ggml_opt_optimizer_params {
    struct {
        float alpha; // learning rate
        float beta1; // first AdamW momentum
        float beta2; // second AdamW momentum
        float eps;   // epsilon for numerical stability
        float wd;    // weight decay - 0.0f to disable
    } adamw;
    struct {
        float alpha; // learning rate
        float wd;    // weight decay
    } sgd;
};

// callback to calculate optimizer parameters prior to a backward pass
// userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);

// returns the default optimizer params (constant, hard-coded values)
// userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);

// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
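As a sketch of the callback mechanism above: a hypothetical schedule that decays the AdamW learning rate per optimizer step (the constants are arbitrary, and the step counter is passed via userdata):

    #include <cmath>
    #include "ggml-opt.h"

    struct ggml_opt_optimizer_params lr_decay_pars(void * userdata) {
        const int64_t step = *(const int64_t *) userdata;

        struct ggml_opt_optimizer_params pars = ggml_opt_get_default_optimizer_params(nullptr);
        pars.adamw.alpha = 1e-3f * std::pow(0.99f, (float) step); // exponential decay
        return pars;
    }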
|
||||
// parameters for initializing a new optimization context
|
||||
struct ggml_opt_params {
|
||||
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
|
||||
|
||||
// by default the forward graph needs to be reconstructed for each eval
|
||||
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
|
||||
struct ggml_context * ctx_compute;
|
||||
struct ggml_tensor * inputs;
|
||||
struct ggml_tensor * outputs;
|
||||
|
||||
enum ggml_opt_loss_type loss_type;
|
||||
enum ggml_opt_build_type build_type;
|
||||
|
||||
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
|
||||
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
|
||||
enum ggml_opt_optimizer_type optimizer;
|
||||
};
|
||||
|
||||
// get parameters for an optimization context with defaults set where possible
|
||||
// parameters for which no sensible defaults exist are supplied as arguments to this function
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(
|
||||
ggml_backend_sched_t backend_sched,
|
||||
enum ggml_opt_loss_type loss_type);
|
||||
|
||||
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
|
||||
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
|
||||
|
||||
// set gradients to zero, initilize loss, and optionally reset the optimizer
|
||||
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
|
||||
|
||||
GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
|
||||
|
||||
// get underlying tensors that store data
|
||||
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
|
||||
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
|
||||
GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
|
||||
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
|
||||
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
|
||||
|
||||
// get the gradient accumulator for a node from the forward graph
|
||||
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
||||
|
||||
GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
|
||||
|
||||
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
|
||||
|
||||
// ====== Optimization Result ======
|
||||
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
|
||||
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
|
||||
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
|
||||
|
||||
// get data from result, uncertainties are optional and can be ignored by passing NULL
|
||||
GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints
|
||||
GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value
|
||||
GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values
|
||||
GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value
|
||||
|
||||
// ====== Computation ======
|
||||
|
||||
// if not using static graphs, this function must be called prior to ggml_opt_alloc
|
||||
GGML_API void ggml_opt_prepare_alloc(
|
||||
ggml_opt_context_t opt_ctx,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * inputs,
|
||||
struct ggml_tensor * outputs);
|
||||
|
||||
// allocate the next graph for evaluation, either forward or forward + backward
|
||||
// must be called exactly once prior to calling ggml_opt_eval
|
||||
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
|
||||
|
||||
// do forward pass, increment result if not NULL, do backward pass if allocated
|
||||
GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);

// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
// ## functions or structs and can be copied to and adapted for user code.   ##
// ############################################################################

// ====== Intended Usage ======
//
// 1. Select the appropriate loss for your problem.
// 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
//    A shard size of 1 is fine; the shard size only sets the granularity with which data is shuffled/loaded (bigger values are faster).
// 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
//    The first context should contain the model parameters and inputs and be allocated statically in user code.
//    The second context should contain all other tensors and will be (re)allocated automatically.
//    Due to this automated allocation the data of the second context is not defined when accessed in user code.
//    Note that the second dimension of the inputs/outputs is interpreted as the number of datapoints in those tensors.
// 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead; see the sketches below.

// signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
typedef void (*ggml_opt_epoch_callback)(
        bool               train,       // true after training evaluation, false after validation evaluation
        ggml_opt_context_t opt_ctx,
        ggml_opt_dataset_t dataset,
        ggml_opt_result_t  result,      // result associated with the dataset subsection
        int64_t            ibatch,      // number of batches that have been evaluated so far
        int64_t            ibatch_max,  // total number of batches in this dataset subsection
        int64_t            t_start_us); // time at which the evaluation on the dataset subsection was started

// do training on front of dataset, do evaluation only on back of dataset
GGML_API void ggml_opt_epoch(
        ggml_opt_context_t      opt_ctx,
        ggml_opt_dataset_t      dataset,
        ggml_opt_result_t       result_train, // result to increment during training, ignored if NULL
        ggml_opt_result_t       result_eval,  // result to increment during evaluation, ignored if NULL
        int64_t                 idata_split,  // data index at which to split training and evaluation
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval);

// callback that prints a progress bar on stderr
GGML_API void ggml_opt_epoch_callback_progress_bar(
        bool               train,
        ggml_opt_context_t opt_ctx,
        ggml_opt_dataset_t dataset,
        ggml_opt_result_t  result,
        int64_t            ibatch,
        int64_t            ibatch_max,
        int64_t            t_start_us);
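
// A minimal epoch sketch using the two functions above, assuming opt_ctx and
// dataset are already set up; ndata is hypothetical user state:
//
//     ggml_opt_result_t result_train = ggml_opt_result_init();
//     ggml_opt_result_t result_eval  = ggml_opt_result_init();
//     ggml_opt_epoch(opt_ctx, dataset, result_train, result_eval,
//         /*idata_split =*/ ndata*9/10, // train on the first 90%, evaluate on the rest
//         ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
//     double loss, unc;
//     ggml_opt_result_loss(result_eval, &loss, &unc);
//     fprintf(stderr, "validation loss: %lf +- %lf\n", loss, unc);
//     ggml_opt_result_free(result_train);
//     ggml_opt_result_free(result_eval);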

// fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit(
        ggml_backend_sched_t          backend_sched,  // backend scheduler for constructing the compute graphs
        struct ggml_context         * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
        struct ggml_tensor          * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
        struct ggml_tensor          * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
        ggml_opt_dataset_t            dataset,        // dataset with data and optionally also labels
        enum ggml_opt_loss_type       loss_type,      // loss to minimize
        enum ggml_opt_optimizer_type  optimizer,      // sgd or adamw
        ggml_opt_get_optimizer_params get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
        int64_t                       nepoch,         // how many times the dataset should be iterated over
        int64_t                       nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
        float                         val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
        bool                          silent);        // whether or not info prints to stderr should be suppressed
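
// An end-to-end sketch of step 4 above; the loss/optimizer enum values and
// ggml_opt_get_default_optimizer_params are assumed to be declared earlier in
// this header, everything else is hypothetical user state:
//
//     ggml_opt_fit(backend_sched, ctx_compute, inputs, outputs, dataset,
//         GGML_OPT_LOSS_TYPE_CROSS_ENTROPY, GGML_OPT_OPTIMIZER_TYPE_ADAMW,
//         ggml_opt_get_default_optimizer_params,
//         /*nepoch         =*/ 4,
//         /*nbatch_logical =*/ 512,
//         /*val_split      =*/ 0.05f,
//         /*silent         =*/ false);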

#ifdef __cplusplus
}
#endif
30
ggml/include/ggml-rpc.h
Normal file
@@ -0,0 +1,30 @@
#pragma once

#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 6
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS    16

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);

GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);

GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
                                                    size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
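
// A minimal client-side sketch, assuming a server is already listening at the
// (illustrative) endpoint; ggml_backend_free comes from ggml-backend.h:
//
//     ggml_backend_t rpc = ggml_backend_rpc_init("192.168.1.10:50052", /*device =*/ 0);
//     size_t free_mem, total_mem;
//     ggml_backend_rpc_get_device_memory("192.168.1.10:50052", 0, &free_mem, &total_mem);
//     // ... use `rpc` like any other ggml backend ...
//     ggml_backend_free(rpc);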

#ifdef __cplusplus
}
#endif
49
ggml/include/ggml-sycl.h
Normal file
@@ -0,0 +1,49 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#define GGML_SYCL_NAME "SYCL"
#define GGML_SYCL_MAX_DEVICES 48

#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);

GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int * id_list, int max_len);
GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
                                                               char * description,
                                                               size_t description_size);
GGML_BACKEND_API int  ggml_backend_sycl_get_device_count();
GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t * free, size_t * total);
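
// A hedged sketch: enumerate SYCL devices before picking one to initialize;
// the 256-byte description buffer is an arbitrary illustrative choice:
//
//     for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
//         char desc[256];
//         size_t free_mem, total_mem;
//         ggml_backend_sycl_get_device_description(i, desc, sizeof(desc));
//         ggml_backend_sycl_get_device_memory(i, &free_mem, &total_mem);
//         printf("SYCL device %d: %s, %zu/%zu bytes free\n", i, desc, free_mem, total_mem);
//     }
//     ggml_backend_t backend = ggml_backend_sycl_init(0); // first device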

// SYCL doesn't support registering host memory, keep here for reference
// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);

#ifdef __cplusplus
}
#endif
29
ggml/include/ggml-vulkan.h
Normal file
@@ -0,0 +1,29 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_BACKEND_API int  ggml_backend_vk_get_device_count(void);
GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);

#ifdef __cplusplus
}
#endif
19
ggml/include/ggml-webgpu.h
Normal file
@@ -0,0 +1,19 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_WEBGPU_NAME "WebGPU"

// Needed for examples in ggml
GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);

#ifdef __cplusplus
}
#endif
17
ggml/include/ggml-zdnn.h
Normal file
@@ -0,0 +1,17 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

#ifdef __cplusplus
}
#endif
22
ggml/include/ggml-zendnn.h
Normal file
@@ -0,0 +1,22 @@
#pragma once

#include "ggml-backend.h"
#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void);

GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend);

// number of threads used for zendnn operations
GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void);

#ifdef __cplusplus
}
#endif
2724
ggml/include/ggml.h
Normal file
File diff suppressed because it is too large
202
ggml/include/gguf.h
Normal file
@@ -0,0 +1,202 @@
// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
// GGUF files have the following structure:
//
// 1. File magic "GGUF" (4 bytes).
// 2. File version (uint32_t).
// 3. Number of ggml tensors in file (int64_t).
// 4. Number of key-value-pairs in file (int64_t).
// 5. For each KV pair:
//    1. The key (string).
//    2. The value type (gguf_type).
//    3a. If the value type is GGUF_TYPE_ARRAY:
//        1. The type of the array (gguf_type).
//        2. The number of elements in the array (uint64_t).
//        3. The binary representation of each element in the array.
//    3b. Otherwise:
//        1. The binary representation of the value.
// 6. For each ggml tensor:
//    1. The tensor name (string).
//    2. The number of dimensions of the tensor (uint32_t).
//    3. For each dimension:
//       1. The size of the tensor in the dimension (int64_t).
//    4. The tensor data type (ggml_type).
//    5. The tensor data offset in the tensor data binary blob (uint64_t).
// 7. The tensor data binary blob (optional, aligned).
//
// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
// All enums are stored as int32_t.
// All bool values are stored as int8_t.
// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
// otherwise GGUF_DEFAULT_ALIGNMENT is used.
//
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)

#pragma once

#include "ggml.h"

#include <stdbool.h>
#include <stdint.h>

#define GGUF_MAGIC   "GGUF"
#define GGUF_VERSION 3

#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"

#define GGUF_DEFAULT_ALIGNMENT 32

#ifdef __cplusplus
extern "C" {
#endif

// types that can be stored as GGUF KV data
enum gguf_type {
    GGUF_TYPE_UINT8   = 0,
    GGUF_TYPE_INT8    = 1,
    GGUF_TYPE_UINT16  = 2,
    GGUF_TYPE_INT16   = 3,
    GGUF_TYPE_UINT32  = 4,
    GGUF_TYPE_INT32   = 5,
    GGUF_TYPE_FLOAT32 = 6,
    GGUF_TYPE_BOOL    = 7,
    GGUF_TYPE_STRING  = 8,
    GGUF_TYPE_ARRAY   = 9,
    GGUF_TYPE_UINT64  = 10,
    GGUF_TYPE_INT64   = 11,
    GGUF_TYPE_FLOAT64 = 12,
    GGUF_TYPE_COUNT,  // marks the end of the enum
};

struct gguf_context;

struct gguf_init_params {
    bool no_alloc;

    // if not NULL, create a ggml_context and allocate the tensor data in it
    struct ggml_context ** ctx;
};

GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);

GGML_API void gguf_free(struct gguf_context * ctx);

GGML_API const char * gguf_type_name(enum gguf_type type);

GGML_API uint32_t gguf_get_version    (const struct gguf_context * ctx);
GGML_API size_t   gguf_get_alignment  (const struct gguf_context * ctx);
GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);

GGML_API int64_t      gguf_get_n_kv(const struct gguf_context * ctx);
GGML_API int64_t      gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);

GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);

// will abort if the wrong type is used for the key
GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int64_t key_id);
GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int64_t key_id);
GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
GGML_API size_t       gguf_get_arr_n   (const struct gguf_context * ctx, int64_t key_id);

// get raw pointer to the first element of the array with the given key_id
// for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);

// get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);

GGML_API int64_t        gguf_get_n_tensors    (const struct gguf_context * ctx);
GGML_API int64_t        gguf_find_tensor      (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
GGML_API const char *   gguf_get_tensor_name  (const struct gguf_context * ctx, int64_t tensor_id);
GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int64_t tensor_id);
GGML_API size_t         gguf_get_tensor_size  (const struct gguf_context * ctx, int64_t tensor_id);
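
// A hedged read-only sketch tying the getters above together; "model.gguf" is
// illustrative and no_alloc == true skips allocating the tensor data:
//
//     struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
//     struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
//     if (ctx) {
//         for (int64_t i = 0; i < gguf_get_n_kv(ctx); ++i) {
//             printf("%s: %s\n", gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
//         }
//         for (int64_t i = 0; i < gguf_get_n_tensors(ctx); ++i) {
//             printf("tensor %s: %zu bytes at offset %zu\n", gguf_get_tensor_name(ctx, i),
//                 gguf_get_tensor_size(ctx, i), gguf_get_tensor_offset(ctx, i));
//         }
//         gguf_free(ctx);
//     }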

// removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);

// overrides an existing KV pair or adds a new one, the new KV pair is always at the back
GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);

// creates a new array with n elements of the given type and copies the corresponding number of bytes from data
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);

// creates a new array with n strings and copies the corresponding strings from data
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);

// set or add KV pairs from another context
GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);

// add tensor to GGUF context, tensor name must be unique
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);

// after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
// in such a way that the tensor data remains as one contiguous block (except for padding)
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);

// assumes that at least gguf_get_tensor_size bytes can be read from data
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);

// writing gguf files can be done in 3 ways:
//
// - write the entire gguf_context to a binary file in a single pass:
//
//   gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
//
// - write only the meta data to a file, then re-open the file and append the tensor data:
//
//   gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
//   FILE * f = fopen(fname, "ab");
//   fwrite(..., f); // write tensor data
//   fclose(f);
//
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
//
//   FILE * f = fopen(fname, "wb");
//   const size_t size_meta = gguf_get_meta_size(ctx);
//   fseek(f, size_meta, SEEK_SET);
//   fwrite(..., f); // write tensor data
//   void * data = malloc(size_meta);
//   gguf_get_meta_data(ctx, data);
//   rewind(f);
//   fwrite(data, 1, size_meta, f);
//   free(data);
//   fclose(f);
//

// write the entire context to a binary file
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
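
// A hedged single-pass writer sketch; `tensor` is assumed to be a valid
// ggml_tensor created elsewhere and the key/values are illustrative:
//
//     struct gguf_context * ctx = gguf_init_empty();
//     gguf_set_val_str(ctx, "general.architecture", "example");
//     gguf_set_val_u32(ctx, GGUF_KEY_GENERAL_ALIGNMENT, 32);
//     gguf_add_tensor(ctx, tensor);
//     if (!gguf_write_to_file(ctx, "out.gguf", /*only_meta =*/ false)) {
//         fprintf(stderr, "failed to write out.gguf\n");
//     }
//     gguf_free(ctx);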

// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);

// writes the meta data to pointer "data"
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);

#ifdef __cplusplus
}
#endif