2024-07-23 13:10:17 +03:00
# include "llama-impl.h"
2025-01-03 10:18:53 +02:00
# include "llama-chat.h"
# include "llama-mmap.h"
2024-07-23 13:10:17 +03:00
# include "llama-vocab.h"
2025-01-03 10:18:53 +02:00
# include "llama-model-loader.h"
2025-05-12 14:44:49 +02:00
# include "llama-model-saver.h"
2025-01-03 10:18:53 +02:00
# include "llama-model.h"
2023-10-03 09:16:26 +02:00
2023-03-22 07:32:36 +02:00
# include "ggml.h"
2023-12-21 21:07:46 +01:00
# include "ggml-backend.h"
2023-08-21 23:07:43 +03:00
# include <algorithm>
2023-08-26 14:17:51 -04:00
# include <cstddef>
# include <cstdint>
# include <cstdio>
2023-08-21 23:07:43 +03:00
# include <cstring>
# include <ctime>
2023-03-29 13:51:37 -07:00
2023-06-16 21:23:53 +03:00
# if defined(_MSC_VER)
# pragma warning(disable: 4244 4267) // possible loss of data
# endif
2025-01-03 10:18:53 +02:00
//
// interface implementation
//
2023-05-02 22:26:13 -04:00
2025-01-03 10:18:53 +02:00
// Returns the default parameters for a sampler chain.
// no_perf = true: performance timing collection is disabled by default.
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
    };

    return result;
}
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
// Hard upper bound on the number of devices a single model can be split across.
size_t llama_max_devices(void) {
    return 16;
}
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
bool llama_supports_mmap ( void ) {
return llama_mmap : : SUPPORTED ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
bool llama_supports_mlock ( void ) {
return llama_mlock : : SUPPORTED ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// True when at least one GPU device is registered, or when an RPC backend is
// available (RPC servers may expose remote GPUs).
bool llama_supports_gpu_offload(void) {
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
           llama_supports_rpc();
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// True when the RPC backend is registered with ggml.
bool llama_supports_rpc(void) {
    return ggml_backend_reg_by_name("RPC") != nullptr;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
void llama_backend_init ( void ) {
ggml_time_init ( ) ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0 , NULL , false } ;
struct ggml_context * ctx = ggml_init ( params ) ;
ggml_free ( ctx ) ;
}
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
void llama_numa_init ( enum ggml_numa_strategy numa ) {
if ( numa ! = GGML_NUMA_STRATEGY_DISABLED ) {
auto * dev = ggml_backend_dev_by_type ( GGML_BACKEND_DEVICE_TYPE_CPU ) ;
GGML_ASSERT ( dev & & " CPU backend is not loaded " ) ;
auto * reg = ggml_backend_dev_backend_reg ( dev ) ;
auto * numa_init_fn = ( decltype ( ggml_numa_init ) * ) ggml_backend_reg_get_proc_address ( reg , " ggml_backend_cpu_numa_init " ) ;
numa_init_fn ( numa ) ;
}
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// Release global backend resources (quantization scratch buffers).
void llama_backend_free(void) {
    ggml_quantize_free();
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// Monotonic timestamp in microseconds, forwarded from ggml's timer.
int64_t llama_time_us(void) {
    return ggml_time_us();
}
2024-07-28 00:42:05 -04:00
2025-03-13 12:35:44 +02:00
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load ( const std : : string & fname , std : : vector < std : : string > & splits , llama_model & model , llama_model_params & params ) {
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model . t_load_us = 0 ;
time_meas tm ( model . t_load_us ) ;
model . t_start_us = tm . t_start_us ;
try {
2025-04-02 14:52:01 +02:00
llama_model_loader ml ( fname , splits , params . use_mmap , params . check_tensors , params . kv_overrides , params . tensor_buft_overrides ) ;
2025-03-13 12:35:44 +02:00
ml . print_info ( ) ;
model . hparams . vocab_only = params . vocab_only ;
try {
model . load_arch ( ml ) ;
} catch ( const std : : exception & e ) {
throw std : : runtime_error ( " error loading model architecture: " + std : : string ( e . what ( ) ) ) ;
}
try {
model . load_hparams ( ml ) ;
} catch ( const std : : exception & e ) {
throw std : : runtime_error ( " error loading model hyperparameters: " + std : : string ( e . what ( ) ) ) ;
}
try {
model . load_vocab ( ml ) ;
} catch ( const std : : exception & e ) {
throw std : : runtime_error ( " error loading model vocabulary: " + std : : string ( e . what ( ) ) ) ;
}
model . load_stats ( ml ) ;
model . print_info ( ) ;
if ( params . vocab_only ) {
LLAMA_LOG_INFO ( " %s: vocab only - skipping tensors \n " , __func__ ) ;
return 0 ;
}
if ( ! model . load_tensors ( ml ) ) {
return - 2 ;
}
} catch ( const std : : exception & err ) {
LLAMA_LOG_ERROR ( " %s: error loading model: %s \n " , __func__ , err . what ( ) ) ;
return - 1 ;
}
return 0 ;
}
2025-01-16 13:54:08 +01:00
static struct llama_model * llama_model_load_from_file_impl (
const std : : string & path_model ,
std : : vector < std : : string > & splits ,
2025-01-06 10:55:18 +02:00
struct llama_model_params params ) {
2025-01-03 10:18:53 +02:00
ggml_time_init ( ) ;
2024-07-28 00:42:05 -04:00
2025-05-16 07:38:07 -07:00
if ( ! params . vocab_only & & ggml_backend_reg_count ( ) = = 0 ) {
LLAMA_LOG_ERROR ( " %s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function \n " , __func__ ) ;
return nullptr ;
}
2025-01-03 10:18:53 +02:00
unsigned cur_percentage = 0 ;
if ( params . progress_callback = = NULL ) {
params . progress_callback_user_data = & cur_percentage ;
params . progress_callback = [ ] ( float progress , void * ctx ) {
unsigned * cur_percentage_p = ( unsigned * ) ctx ;
unsigned percentage = ( unsigned ) ( 100 * progress ) ;
while ( percentage > * cur_percentage_p ) {
* cur_percentage_p = percentage ;
LLAMA_LOG_CONT ( " . " ) ;
if ( percentage > = 100 ) {
LLAMA_LOG_CONT ( " \n " ) ;
2024-07-28 00:42:05 -04:00
}
}
2025-01-03 10:18:53 +02:00
return true ;
} ;
}
2024-07-28 00:42:05 -04:00
2025-02-07 15:48:47 +02:00
llama_model * model = new llama_model ( params ) ;
2025-01-03 10:18:53 +02:00
// create list of devices to use with this model
if ( params . devices ) {
for ( ggml_backend_dev_t * dev = params . devices ; * dev ; + + dev ) {
model - > devices . push_back ( * dev ) ;
}
} else {
2025-01-26 23:20:34 +08:00
std : : vector < ggml_backend_dev_t > rpc_servers ;
2025-01-03 10:18:53 +02:00
// use all available devices
for ( size_t i = 0 ; i < ggml_backend_dev_count ( ) ; + + i ) {
ggml_backend_dev_t dev = ggml_backend_dev_get ( i ) ;
switch ( ggml_backend_dev_type ( dev ) ) {
case GGML_BACKEND_DEVICE_TYPE_CPU :
case GGML_BACKEND_DEVICE_TYPE_ACCEL :
// skip CPU backends since they are handled separately
break ;
2023-10-03 21:04:01 +03:00
2025-01-03 10:18:53 +02:00
case GGML_BACKEND_DEVICE_TYPE_GPU :
2025-01-26 23:20:34 +08:00
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg ( dev ) ;
if ( ggml_backend_reg_name ( reg ) = = std : : string ( " RPC " ) ) {
rpc_servers . push_back ( dev ) ;
} else {
model - > devices . push_back ( dev ) ;
}
2025-01-03 10:18:53 +02:00
break ;
2024-07-28 00:42:05 -04:00
}
}
2025-01-26 23:20:34 +08:00
// add RPC servers at the front of the list
if ( ! rpc_servers . empty ( ) ) {
model - > devices . insert ( model - > devices . begin ( ) , rpc_servers . begin ( ) , rpc_servers . end ( ) ) ;
}
2024-07-28 00:42:05 -04:00
}
2023-10-03 21:04:01 +03:00
2025-01-03 10:18:53 +02:00
// if using single GPU mode, remove all except the main GPU
if ( params . split_mode = = LLAMA_SPLIT_MODE_NONE ) {
2025-06-16 08:11:43 -07:00
if ( params . main_gpu < 0 ) {
model - > devices . clear ( ) ;
} else {
if ( params . main_gpu > = ( int ) model - > devices . size ( ) ) {
LLAMA_LOG_ERROR ( " %s: invalid value for main_gpu: %d (available devices: %zu) \n " , __func__ , params . main_gpu , model - > devices . size ( ) ) ;
llama_model_free ( model ) ;
return nullptr ;
}
ggml_backend_dev_t main_gpu = model - > devices [ params . main_gpu ] ;
model - > devices . clear ( ) ;
model - > devices . push_back ( main_gpu ) ;
2025-01-03 10:18:53 +02:00
}
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
for ( auto * dev : model - > devices ) {
size_t free , total ; // NOLINT
ggml_backend_dev_memory ( dev , & free , & total ) ;
LLAMA_LOG_INFO ( " %s: using device %s (%s) - %zu MiB free \n " , __func__ , ggml_backend_dev_name ( dev ) , ggml_backend_dev_description ( dev ) , free / 1024 / 1024 ) ;
2024-08-08 23:54:00 -04:00
}
2025-01-16 13:54:08 +01:00
const int status = llama_model_load ( path_model , splits , * model , params ) ;
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( status < = 0 ) ;
if ( status < 0 ) {
if ( status = = - 1 ) {
LLAMA_LOG_ERROR ( " %s: failed to load model \n " , __func__ ) ;
} else if ( status = = - 2 ) {
LLAMA_LOG_INFO ( " %s: cancelled model load \n " , __func__ ) ;
}
2025-01-06 10:55:18 +02:00
llama_model_free ( model ) ;
2025-01-03 10:18:53 +02:00
return nullptr ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
return model ;
}
2024-07-28 00:42:05 -04:00
2025-01-16 13:54:08 +01:00
// deprecated: kept for ABI compatibility, forwards to llama_model_load_from_file
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    return llama_model_load_from_file(path_model, params);
}
// Load a single-file model; delegates to the shared impl with an empty split list.
struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(path_model, splits, params);
}
// Load a model stored as multiple split files. The first path is treated as
// the primary file; returns nullptr if the list is empty.
struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    std::vector<std::string> splits;
    if (n_paths == 0) {
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(splits.front(), splits, params);
}
2025-05-12 14:44:49 +02:00
// Serialize a loaded model (metadata + tensors) to a GGUF file at path_model.
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
    llama_model_saver ms(*model);
    ms.add_kv_from_model();
    ms.add_tensors_from_model();
    ms.save(path_model);
}
2024-07-23 13:10:17 +03:00
//
// chat templates
//
2024-02-19 09:23:37 +01:00
2024-07-23 13:10:17 +03:00
int32_t llama_chat_apply_template (
2024-02-19 09:23:37 +01:00
const char * tmpl ,
const struct llama_chat_message * chat ,
size_t n_msg ,
bool add_ass ,
char * buf ,
int32_t length ) {
2025-01-12 11:32:42 +02:00
const std : : string curr_tmpl ( tmpl = = nullptr ? " chatml " : tmpl ) ;
2024-03-07 11:41:53 +02:00
2024-02-19 09:23:37 +01:00
// format the chat to string
std : : vector < const llama_chat_message * > chat_vec ;
chat_vec . resize ( n_msg ) ;
for ( size_t i = 0 ; i < n_msg ; i + + ) {
chat_vec [ i ] = & chat [ i ] ;
}
2024-03-07 11:41:53 +02:00
2024-02-19 09:23:37 +01:00
std : : string formatted_chat ;
2025-01-03 10:18:53 +02:00
llm_chat_template detected_tmpl = llm_chat_detect_template ( curr_tmpl ) ;
2024-12-02 22:10:19 +01:00
if ( detected_tmpl = = LLM_CHAT_TEMPLATE_UNKNOWN ) {
return - 1 ;
}
2025-01-03 10:18:53 +02:00
int32_t res = llm_chat_apply_template ( detected_tmpl , chat_vec , formatted_chat , add_ass ) ;
2024-02-19 09:23:37 +01:00
if ( res < 0 ) {
return res ;
}
2024-03-07 11:41:53 +02:00
if ( buf & & length > 0 ) {
strncpy ( buf , formatted_chat . c_str ( ) , length ) ;
}
2024-02-19 09:23:37 +01:00
return res ;
}
2024-07-23 13:10:17 +03:00
//
2024-09-07 15:16:19 +03:00
// model split
2024-07-23 13:10:17 +03:00
//
// Build the path of split file number `split_no` (0-based) out of `split_count`
// for the given prefix, e.g. ("model", 1, 3) -> "model-00002-of-00003.gguf".
// Writes at most maxlen bytes into split_path and returns the resulting string
// length, or 0 on formatting failure.
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
        return strlen(split_path);
    }
    return 0;
}
2025-01-06 17:52:35 +02:00
// Extract the path prefix from a split file path: if split_path ends with the
// "-%05d-of-%05d.gguf" postfix for (split_no, split_count), copy the prefix
// into split_prefix (bounded by maxlen) and return its length; otherwise
// return 0.
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);

    // check if split_path ends with postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }

    return 0;
}
2023-03-22 07:32:36 +02:00
const char * llama_print_system_info ( void ) {
static std : : string s ;
2025-01-06 12:21:46 +01:00
s . clear ( ) ; // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
2024-11-25 15:13:39 +01:00
for ( size_t i = 0 ; i < ggml_backend_reg_count ( ) ; i + + ) {
auto * reg = ggml_backend_reg_get ( i ) ;
auto * get_features_fn = ( ggml_backend_get_features_t ) ggml_backend_reg_get_proc_address ( reg , " ggml_backend_get_features " ) ;
if ( get_features_fn ) {
ggml_backend_feature * features = get_features_fn ( reg ) ;
s + = ggml_backend_reg_name ( reg ) ;
s + = " : " ;
for ( ; features - > name ; features + + ) {
s + = features - > name ;
s + = " = " ;
s + = features - > value ;
s + = " | " ;
}
}
}
2023-03-22 07:32:36 +02:00
return s . c_str ( ) ;
}
2025-05-12 14:44:49 +02:00