#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif

#include "ggml.h"
#include "gguf.h"

#include "common.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"

#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <codecvt>
#include <chrono>
#include <cstdarg>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <regex>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#   define NOMINMAX
#endif
#include <locale>
#include <windows.h>
#include <string.h>
#include <fcntl.h>
#include <io.h>
#else
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif

#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

common_time_meas::~common_time_meas() {
    if (t_start_us >= 0) {
        t_acc += ggml_time_us() - t_start_us;
    }
}
//
// CPU utils
//
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
            + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            siblings.insert(line);
        }
    }
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    unsigned int n_threads_win = std::thread::hardware_concurrency();
    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;

    DWORD buffer_size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return default_threads;
        }
    }

    std::vector<char> buffer(buffer_size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
        return default_threads;
    }

    int32_t num_physical_cores = 0;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    while (buffer_size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            num_physical_cores += info->Processor.GroupCount;
        }
        buffer_size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char *>(info) + info->Size);
    }

    return num_physical_cores > 0 ? num_physical_cores : default_threads;
#endif
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
#include <pthread.h>

static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__("movq\t%%rbx,%%rsi\n\t"
            "cpuid\n\t"
            "xchgq\t%%rbx,%%rsi"
            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
            : "0"(leaf), "2"(subleaf));
}

static int pin_cpu(int cpu) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}

static bool is_hybrid_cpu(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
    return !!(edx & (1u << 15));
}

static bool is_running_on_efficiency_core(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
    int intel_atom = 0x20;
    int core_type = (eax & 0xff000000u) >> 24;
    return core_type == intel_atom;
}

static int cpu_count_math_cpus(int n_cpu) {
    int result = 0;
    for (int cpu = 0; cpu < n_cpu; ++cpu) {
        if (pin_cpu(cpu)) {
            return -1;
        }
        if (is_running_on_efficiency_core()) {
            continue; // efficiency cores harm lockstep threading
        }
        ++cpu; // hyperthreading isn't useful for linear algebra
        ++result;
    }
    return result;
}

#endif // __x86_64__ && __linux__

/**
 * Returns number of CPUs on system that are useful for math.
 */
int32_t cpu_get_num_math() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (n_cpu < 1) {
        return cpu_get_num_physical_cores();
    }
    if (is_hybrid_cpu()) {
        cpu_set_t affinity;
        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
            int result = cpu_count_math_cpus(n_cpu);
            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
            if (result > 0) {
                return result;
            }
        }
    }
#endif
    return cpu_get_num_physical_cores();
}
// Helper for setting process priority
#if defined(_WIN32)

bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

    return true;
}

#else // MacOS and POSIX
#include <sys/types.h>
#include <sys/resource.h>

bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    int p = 0;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      p =   5; break;
        case GGML_SCHED_PRIO_NORMAL:   p =   0; break;
        case GGML_SCHED_PRIO_MEDIUM:   p =  -5; break;
        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
    }
    if (!setpriority(PRIO_PROCESS, 0, p)) {
        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }

    return true;
}
#endif
//
// CLI argument parsing
//
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model) {
    int32_t n_set = 0;

    if (cpuparams.n_threads < 0) {
        // Assuming everything about cpuparams is invalid
        if (role_model != nullptr) {
            cpuparams = *role_model;
        } else {
            cpuparams.n_threads = cpu_get_num_math();
        }
    }

    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
        if (cpuparams.cpumask[i]) {
            n_set++;
        }
    }

    if (n_set && n_set < cpuparams.n_threads) {
        // Not enough set bits, may experience performance issues.
        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
    }
}

bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    size_t dash_loc = range.find('-');
    if (dash_loc == std::string::npos) {
        LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
        return false;
    }

    size_t start_i;
    size_t end_i;

    if (dash_loc == 0) {
        start_i = 0;
    } else {
        start_i = std::stoull(range.substr(0, dash_loc));
        if (start_i >= GGML_MAX_N_THREADS) {
            LOG_ERR("Start index out of bounds!\n");
            return false;
        }
    }

    if (dash_loc == range.length() - 1) {
        end_i = GGML_MAX_N_THREADS - 1;
    } else {
        end_i = std::stoull(range.substr(dash_loc + 1));
        if (end_i >= GGML_MAX_N_THREADS) {
            LOG_ERR("End index out of bounds!\n");
            return false;
        }
    }

    for (size_t i = start_i; i <= end_i; i++) {
        boolmask[i] = true;
    }

    return true;
}

bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    // Discard potential 0x prefix
    size_t start_i = 0;
    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
        start_i = 2;
    }

    size_t num_digits = mask.length() - start_i;
    if (num_digits > 128) num_digits = 128;

    size_t end_i = num_digits + start_i;

    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n -= 4) {
        char c = mask.at(i);
        int8_t id = c;

        if ((c >= '0' && c <= '9')) {
            id -= '0';
        } else if (c >= 'a' && c <= 'f') {
            id -= 'a' - 10;
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }

        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
    }

    return true;
}
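
// Illustrative usage of the two helpers above (a sketch for documentation, not part
// of the upstream API surface): both fill a boolean mask indexed by logical CPU id.
//
//   bool cpumask[GGML_MAX_N_THREADS] = { false };
//   parse_cpu_range("0-3",  cpumask); // sets cpumask[0] .. cpumask[3]
//   parse_cpu_mask ("0xF0", cpumask); // additionally sets cpumask[4] .. cpumask[7]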
void common_init() {
    llama_log_set(common_log_default_callback, NULL);

#ifdef NDEBUG
    const char * build_type = "";
#else
    const char * build_type = " (debug)";
#endif

    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}

std::string common_params_get_system_info(const common_params & params) {
    std::ostringstream os;

    os << "system_info: n_threads = " << params.cpuparams.n_threads;
    if (params.cpuparams_batch.n_threads != -1) {
        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
    }

#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
#else
    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
#endif

    return os.str();
}
//
// String utils
//
std::string string_format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

std::string string_strip(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && std::isspace(str[start])) {
        start++;
    }
    while (end > start && std::isspace(str[end - 1])) {
        end--;
    }
    return str.substr(start, end - start);
}

std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch() % 1000000000).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, 11, "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}

void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
    size_t last_pos = 0;
    while ((pos = s.find(search, last_pos)) != std::string::npos) {
        builder.append(s, last_pos, pos - last_pos);
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos);
    s = std::move(builder);
}

bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
    return str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
    bool has_suffix = string_ends_with(str, suffix);
    if (has_suffix) {
        str = str.substr(0, str.size() - suffix.size());
    }
    return has_suffix;
}

size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
    if (!str.empty() && !stop.empty()) {
        const char text_last_char = str.back();
        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
            if (stop[char_index] == text_last_char) {
                const auto current_partial = stop.substr(0, char_index + 1);
                if (string_ends_with(str, current_partial)) {
                    return str.size() - char_index - 1;
                }
            }
        }
    }

    return std::string::npos;
}
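
// Illustrative example (an assumption for documentation only): if the text ends with a
// prefix of a stop string, the returned value is the index where that prefix starts,
// e.g. string_find_partial_stop("Hello, wor", "world") == 7.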
std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    return std::regex_replace(s, special_chars, "\\$&");
}

std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::ostringstream result;
    for (size_t i = 0; i < values.size(); ++i) {
        if (i > 0) {
            result << separator;
        }
        result << values[i];
    }
    return result.str();
}

std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> parts;
    size_t start = 0;
    size_t end = str.find(delimiter);

    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }

    parts.push_back(str.substr(start));

    return parts;
}
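
// Illustrative usage (for documentation only):
//   string_split("a,b,c", ",")         -> {"a", "b", "c"}
//   string_join({"a", "b", "c"}, ", ") -> "a, b, c"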
std::string string_repeat(const std::string & str, size_t n) {
    if (n == 0) {
        return "";
    }

    std::string result;
    result.reserve(str.length() * n);

    for (size_t i = 0; i < n; ++i) {
        result += str;
    }

    return result;
}

std::string string_from(bool value) {
    return value ? "true" : "false";
}

std::string string_from(const std::vector<int> & values) {
    std::stringstream buf;

    buf << "[ ";
    bool first = true;
    for (auto e : values) {
        if (first) {
            first = false;
        } else {
            buf << ", ";
        }
        buf << std::to_string(e);
    }
    buf << " ]";

    return buf.str();
}

std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::stringstream buf;

    buf << "[ ";
    bool first = true;
    for (const auto & token : tokens) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, token);
        buf << "'" << detokenized << "'"
            << ":" << std::to_string(token);
    }
    buf << " ]";

    return buf.str();
}

std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
    std::stringstream buf;

    buf << "[ ";
    bool first = true;
    for (int i = 0; i < batch.n_tokens; ++i) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, batch.token[i]);

        buf << "\n" << std::to_string(i)
            << ", token '" << detokenized << "'"
            << ", pos " << std::to_string(batch.pos[i])
            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
            << ", seq_id " << std::to_string(batch.seq_id[i][0])
            << ", logits " << std::to_string(batch.logits[i]);
    }
    buf << " ]";

    return buf.str();
}

void string_process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;

    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
            switch (input[++input_idx]) {
                case 'n':  input[output_idx++] = '\n'; break;
                case 'r':  input[output_idx++] = '\r'; break;
                case 't':  input[output_idx++] = '\t'; break;
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                case 'x':
                    // Handle \x12, etc
                    if (input_idx + 2 < input_len) {
                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
                        char * err_p = nullptr;
                        const long val = std::strtol(x, &err_p, 16);
                        if (err_p == x + 2) {
                            input_idx += 2;
                            input[output_idx++] = char(val);
                            break;
                        }
                    }
                    // fall through
                default:   input[output_idx++] = '\\';
                           input[output_idx++] = input[input_idx]; break;
            }
        } else {
            input[output_idx++] = input[input_idx];
        }
    }

    input.resize(output_idx);
}

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr || sep - data >= 128) {
        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
    std::strncpy(kvo.key, data, sep - data);
    kvo.key[sep - data] = 0;
    sep++;
    if (strncmp(sep, "int:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
        kvo.val_i64 = std::atol(sep);
    } else if (strncmp(sep, "float:", 6) == 0) {
        sep += 6;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
        kvo.val_f64 = std::atof(sep);
    } else if (strncmp(sep, "bool:", 5) == 0) {
        sep += 5;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        if (std::strcmp(sep, "true") == 0) {
            kvo.val_bool = true;
        } else if (std::strcmp(sep, "false") == 0) {
            kvo.val_bool = false;
        } else {
            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(sep, "str:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        if (strlen(sep) > 127) {
            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(kvo.val_str, sep, 127);
        kvo.val_str[127] = '\0';
    } else {
        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(std::move(kvo));
    return true;
}
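
// Illustrative override strings accepted by the parser above (the key names are made-up
// examples; only the "<key>=<type>:<value>" shape and the type prefixes come from the code):
//   "some.int.key=int:42"
//   "some.float.key=float:0.5"
//   "some.bool.key=bool:true"
//   "some.str.key=str:hello"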
//
// Filesystem utils
//
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
    if (!filename.length()) {
        // Empty filename invalid
        return false;
    }
    if (filename.length() > 255) {
        // Limit at common largest possible filename on Linux filesystems
        // to avoid unnecessary further validation
        // (On systems with smaller limits it will be caught by the OS)
        return false;
    }

    std::u32string filename_utf32;
    try {
#if defined(__clang__)
        // disable C++17 deprecation warning for std::codecvt_utf8
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
#    pragma GCC diagnostic push
#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif

        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

#if defined(__clang__)
#    pragma clang diagnostic pop
#elif defined(__GNUC__)
#    pragma GCC diagnostic pop
#endif

        filename_utf32 = converter.from_bytes(filename);

        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
        // or invalid encodings were encountered. Reject such attempts
        std::string filename_reencoded = converter.to_bytes(filename_utf32);
        if (filename_reencoded != filename) {
            return false;
        }
    } catch (const std::exception &) {
        return false;
    }

    // Check for forbidden codepoints:
    // - Control characters
    // - Unicode equivalents of illegal characters
    // - UTF-16 surrogate pairs
    // - UTF-8 replacement character
    // - Byte order mark (BOM)
    // - Illegal characters: / \ : * ? " < > |
    for (char32_t c : filename_utf32) {
        if (c <= 0x1F // Control characters (C0)
            || c == 0x7F // Control characters (DEL)
            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
            || c == 0x2215 // Division Slash (forward slash equivalent)
            || c == 0x2216 // Set Minus (backslash equivalent)
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
            || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
        if (!allow_subdirs && (c == '/' || c == '\\')) {
            // Subdirectories not allowed, reject path separators
            return false;
        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
    // Unicode and other whitespace is not affected, only 0x20 space
    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}
#ifdef _WIN32
static std::wstring utf8_to_wstring(const std::string & str) {
    if (str.empty()) {
        return std::wstring();
    }

    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int) str.size(), NULL, 0);

    if (size <= 0) {
        return std::wstring();
    }

    std::wstring wstr(size, 0);
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int) str.size(), &wstr[0], size);

    return wstr;
}
#endif
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring wpath = utf8_to_wstring(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);
        pos_slash += 1;

        // skip the drive letter, in some systems it can return an access denied error
        if (subpath.length() == 2 && subpath[1] == ':') {
            continue;
        }

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);

        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD attributes = GetFileAttributesW(subpath.c_str());
                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(path.c_str(), &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &info) == 0) {
            if (!S_ISDIR(info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}

bool fs_is_directory(const std::string & path) {
    std::filesystem::path dir(path);
    return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
}

std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
        // Make sure to add trailing slash
        if (p.back() != DIRECTORY_SEPARATOR) {
            p += DIRECTORY_SEPARATOR;
        }
        return p;
    };
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else if (std::getenv("HOME")) {
            cache_directory = std::getenv("HOME") + std::string("/.cache/");
        } else {
#if defined(__linux__)
            /* no $HOME is defined, fallback to getpwuid */
            struct passwd * pw = getpwuid(getuid());
            if ((!pw) || (!pw->pw_dir)) {
                throw std::runtime_error("Failed to find $HOME directory");
            }
            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
            throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
        }
#elif defined(__APPLE__)
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
#elif defined(__EMSCRIPTEN__)
        GGML_ABORT("not implemented on this platform");
#else
#error Unknown architecture
#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
    }
    return ensure_trailing_slash(cache_directory);
}

std::string fs_get_cache_file(const std::string & filename) {
    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
    std::string cache_directory = fs_get_cache_directory();
    const bool success = fs_create_directory_with_parents(cache_directory);
    if (!success) {
        throw std::runtime_error("failed to create cache directory: " + cache_directory);
    }
    return cache_directory + filename;
}

std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
    std::vector<common_file_info> files;
    if (path.empty()) return files;

    std::filesystem::path dir(path);
    if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
        return files;
    }

    for (const auto & entry : std::filesystem::directory_iterator(dir)) {
        try {
            // Only include regular files (skip directories)
            const auto & p = entry.path();
            if (std::filesystem::is_regular_file(p)) {
                common_file_info info;
                info.path = p.string();
                info.name = p.filename().string();
                info.is_dir = false;
                try {
                    info.size = static_cast<size_t>(std::filesystem::file_size(p));
                } catch (const std::filesystem::filesystem_error &) {
                    info.size = 0;
                }
                files.push_back(std::move(info));
            } else if (include_directories && std::filesystem::is_directory(p)) {
                common_file_info info;
                info.path = p.string();
                info.name = p.filename().string();
                info.size = 0; // Directories have no size
                info.is_dir = true;
                files.push_back(std::move(info));
            }
        } catch (const std::filesystem::filesystem_error &) {
            // skip entries we cannot inspect
            continue;
        }
    }

    return files;
}
//
// TTY utils
//
bool tty_can_use_colors() {
    // Check NO_COLOR environment variable (https://no-color.org/)
    if (const char * no_color = std::getenv("NO_COLOR")) {
        if (no_color[0] != '\0') {
            return false;
        }
    }

    // Check TERM environment variable
    if (const char * term = std::getenv("TERM")) {
        if (std::strcmp(term, "dumb") == 0) {
            return false;
        }
    }

    // Check if stdout and stderr are connected to a terminal
    // We check both because log messages can go to either
    bool stdout_is_tty = isatty(fileno(stdout));
    bool stderr_is_tty = isatty(fileno(stderr));

    return stdout_is_tty || stderr_is_tty;
}
//
// Model utils
//
// TODO: move to common/sampling
static void common_init_sampler_from_model(
        const llama_model * model,
        common_params_sampling & sparams) {
    const uint64_t config = sparams.user_sampling_config;

    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
        if (config & user_config) {
            return;
        }
        char buf[64] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            int32_t v = strtol(buf, &end, 10);
            if (end && end != buf) {
                dst = v;
            }
        }
    };

    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
        if (config & user_config) {
            return;
        }
        char buf[128] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            float v = strtof(buf, &end);
            if (end && end != buf) {
                dst = v;
            }
        }
    };

    // Sampling sequence
    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
        char buf[512] = {0};
        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
            if (!sampler_names.empty()) {
                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
            }
        }
    }

    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K),           sparams.top_k,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P),           sparams.top_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P),           sparams.min_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD),   sparams.xtc_threshold,   common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP),            sparams.temp,            common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N),  sparams.penalty_last_n,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT),  sparams.penalty_repeat,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT),        sparams.mirostat,        common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU),    sparams.mirostat_tau,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA),    sparams.mirostat_eta,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}

struct common_init_result::impl {
    impl() = default;
    ~impl() = default;

    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
    llama_model_ptr   model;
    llama_context_ptr context;

    std::vector<llama_adapter_lora_ptr> lora;

    std::vector<common_sampler_ptr> samplers;
};

common_init_result::common_init_result(common_params & params) :
    pimpl(new impl{}) {
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
        return;
    }

    pimpl->model.reset(model);

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // updates params.sampling
    // TODO: fix naming
    common_init_sampler_from_model(model, params.sampling);

    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
        params.sampling.ignore_eos = false;
    }

    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }

    if (params.sampling.ignore_eos) {
        // add EOG biases to the active set of logit biases
        params.sampling.logit_bias.insert(
                params.sampling.logit_bias.end(),
                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
    }

    //if (params.sampling.penalty_last_n == -1) {
    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

    pimpl->samplers.resize(cparams.n_seq_max);
    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
    }

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return;
    }

    pimpl->context.reset(lctx);
}

llama_model * common_init_result::model() {
    return pimpl->model.get();
}

llama_context * common_init_result::context() {
    return pimpl->context.get();
}

common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    return pimpl->samplers[seq_id].get();
}

std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
}

void common_init_result::free_context() {
    pimpl->context.reset();
}

common_init_result_ptr common_init_from_params(common_params & params) {
    common_init_result_ptr res(new common_init_result(params));

    llama_model * model = res->model();
    if (model == NULL) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    llama_context * lctx = res->context();
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }

    if (!params.control_vectors.empty()) {
        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_model_n_layer(model);

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
            return res;
        }

        int err = llama_apply_adapter_cvec(
                lctx,
                cvec.data.data(),
                cvec.data.size(),
                cvec.n_embd,
                params.control_vector_layer_start,
                params.control_vector_layer_end);
        if (err) {
            return res;
        }
    }

    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
        bool ok = true;

        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
            ok = false;
        }

        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

        if (!has_eos && !has_sep && !has_rerank_prompt) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
            ok = false;
        } else if (!has_eos) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
        }

        if (!ok) {
            return res;
        }
    }
    // load and optionally apply lora adapters
    for (auto & la : params.lora_adapters) {
        llama_adapter_lora_ptr lora;
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            return res;
        }

        char buf[1024];
        la.ptr = lora.get();
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
        la.task_name = buf;
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
        la.prompt_prefix = buf;
        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
    }
    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }

    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        llama_set_warmup(lctx, true);

        std::vector<llama_token> tmp;
        llama_token bos = llama_vocab_bos(vocab);
        llama_token eos = llama_vocab_eos(vocab);

        // some models (e.g. T5) don't have a BOS token
        if (bos != LLAMA_TOKEN_NULL) {
            tmp.push_back(bos);
        }
        if (eos != LLAMA_TOKEN_NULL) {
            tmp.push_back(eos);
        }
        if (tmp.empty()) {
            tmp.push_back(0);
        }

        if (llama_model_has_encoder(model)) {
            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                decoder_start_token_id = bos;
            }
            tmp.clear();
            tmp.push_back(decoder_start_token_id);
        }
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
        llama_memory_clear(llama_get_memory(lctx), true);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);
    }

    return res;
}
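
// Illustrative call sequence (a sketch; the model path is a placeholder and error
// handling is abbreviated):
//
//   common_params params;
//   params.model.path = "model.gguf";
//   common_init();
//   common_init_result_ptr init = common_init_from_params(params);
//   llama_model   * model = init->model();
//   llama_context * lctx  = init->context();
//   if (model == NULL || lctx == NULL) { /* handle failure */ }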
common_init_result::~common_init_result() = default;

std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
    std::string model_endpoint = "https://huggingface.co/";
    if (endpoint_env) {
        model_endpoint = endpoint_env;
        if (model_endpoint.back() != '/') {
            model_endpoint += '/';
        }
    }
    return model_endpoint;
}

void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
    llama_clear_adapter_lora(ctx);
    for (auto & la : lora) {
        if (la.scale != 0.0f) {
            llama_set_adapter_lora(ctx, la.ptr, la.scale);
        }
    }
}
struct llama_model_params common_model_params_to_llama(common_params & params) {
    auto mparams = llama_model_default_params();

    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
    mparams.no_host         = params.no_host;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
        mparams.kv_overrides = params.kv_overrides.data();
    }

    if (params.tensor_buft_overrides.empty()) {
        mparams.tensor_buft_overrides = NULL;
    } else {
        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

    mparams.progress_callback           = params.load_progress_callback;
    mparams.progress_callback_user_data = params.load_progress_callback_user_data;

    return mparams;
}

struct llama_context_params common_context_params_to_llama(const common_params & params) {
    auto cparams = llama_context_default_params();

    cparams.n_ctx             = params.n_ctx;
    cparams.n_seq_max         = params.n_parallel;
    cparams.n_batch           = params.n_batch;
    cparams.n_ubatch          = params.n_ubatch;
    cparams.n_threads         = params.cpuparams.n_threads;
    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
    cparams.embeddings        = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base    = params.rope_freq_base;
    cparams.rope_freq_scale   = params.rope_freq_scale;
    cparams.yarn_ext_factor   = params.yarn_ext_factor;
    cparams.yarn_attn_factor  = params.yarn_attn_factor;
    cparams.yarn_beta_fast    = params.yarn_beta_fast;
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
    cparams.pooling_type      = params.pooling_type;
    cparams.attention_type    = params.attention_type;
    cparams.flash_attn_type   = params.flash_attn_type;
    cparams.cb_eval           = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;
    cparams.offload_kqv       = !params.no_kv_offload;
    cparams.no_perf           = params.no_perf;
    cparams.op_offload        = !params.no_op_offload;
    cparams.swa_full          = params.swa_full;
    cparams.kv_unified        = params.kv_unified;

    cparams.type_k = params.cache_type_k;
    cparams.type_v = params.cache_type_v;

    return cparams;
}

struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
    struct ggml_threadpool_params tpp;

    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults

    if (params.mask_valid) {
        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
    }

    tpp.prio       = params.priority;
    tpp.poll       = params.poll;
    tpp.strict_cpu = params.strict_cpu;

    return tpp;
}
//
// Batch utils
//
void common_batch_clear(struct llama_batch & batch) {
    batch.n_tokens = 0;
}

void common_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits) {
    GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");

    batch.token   [batch.n_tokens] = id;
    batch.pos     [batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits  [batch.n_tokens] = logits;

    batch.n_tokens++;
}
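
// Illustrative usage (a sketch; 'prompt_tokens' is a hypothetical token vector): append
// a prompt to sequence 0 and request logits only for the last token.
//
//   llama_batch batch = llama_batch_init(512, 0, 1);
//   common_batch_clear(batch);
//   for (size_t k = 0; k < prompt_tokens.size(); ++k) {
//       const bool is_last = (k == prompt_tokens.size() - 1);
//       common_batch_add(batch, prompt_tokens[k], (llama_pos) k, { 0 }, is_last);
//   }
//   // ... llama_decode(ctx, batch); ... llama_batch_free(batch);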
//
// Token utils
//
size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}

    return i;
}

size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
    // check for empty sequences
    if (a.empty() || b.empty()) {
        return 0;
    }

    // get the lengths of the input sequences
    size_t a_len = a.size();
    size_t b_len = b.size();

    // initialize the maximum length of the longest common subsequence (LCS)
    size_t max_length = 0;

    // use two rows instead of a 2D matrix to optimize space
    std::vector<size_t> prev_row(b_len + 1, 0);
    std::vector<size_t> curr_row(b_len + 1, 0);

    // iterate through the elements of a
    for (size_t i = 1; i <= a_len; i++) {
        // iterate through the elements of b
        for (size_t j = 1; j <= b_len; j++) {
            // if elements at the current positions match
            if (a[i - 1] == b[j - 1]) {
                // if it's the first element of either sequences, set LCS length to 1
                if (i == 1 || j == 1) {
                    curr_row[j] = 1;
                } else {
                    // increment LCS length by 1 compared to the previous element
                    curr_row[j] = prev_row[j - 1] + 1;
                }

                // update max_length if necessary
                if (curr_row[j] > max_length) {
                    max_length = curr_row[j];
                }
            } else {
                // reset LCS length if elements don't match
                curr_row[j] = 0;
            }
        }

        // update the previous row for the next iteration
        prev_row = curr_row;
    }

    // return the maximum length of the LCS
    return max_length;
}
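
// Illustrative examples (derived from the two functions above), with token ids written
// as plain integers:
//   common_lcp({1, 2, 3, 4}, {1, 2, 9})    == 2  // shared prefix {1, 2}
//   common_lcs({7, 1, 2, 3}, {1, 2, 3, 8}) == 3  // longest contiguous match {1, 2, 3}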
//
// Vocab utils
//
std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    return common_tokenize(vocab, text, add_special, parse_special);
}

std::vector<llama_token> common_tokenize(
    const struct llama_vocab * vocab,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    if (n_tokens == std::numeric_limits<int32_t>::min()) {
        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
    }
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }
    return result;
}

std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    return common_token_to_piece(vocab, token, special);
}

std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        piece.resize(-n_chars);
        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
        GGML_ASSERT(check == -n_chars);
    }
    else {
        piece.resize(n_chars);
    }

    return piece;
}

std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    return common_detokenize(vocab, tokens, special);
}

std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size()));
    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(), false, special);
    if (n_chars < 0) {
        text.resize(-n_chars);
        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(), false, special);
        GGML_ASSERT(n_chars <= (int32_t) text.size()); // whitespace trimming is performed after per-token detokenization
    }

    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
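
// Illustrative round trip (a sketch; 'ctx' is an initialized llama_context and the exact
// token ids depend on the loaded model):
//
//   std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", /*add_special=*/true, /*parse_special=*/false);
//   std::string text = common_detokenize(ctx, toks, /*special=*/false);
//   // text is expected to read "Hello world" again, modulo tokenizer whitespace handling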
//
// Embedding utils
//
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double sum = 0.0;

    switch (embd_norm) {
        case -1: // no normalisation
            sum = 1.0;
            break;
        case 0: // max absolute
            for (int i = 0; i < n; i++) {
                if (sum < std::abs(inp[i])) {
                    sum = std::abs(inp[i]);
                }
            }
            sum /= 32760.0; // make an int16 range
            break;
        case 2: // euclidean
            for (int i = 0; i < n; i++) {
                sum += inp[i] * inp[i];
            }
            sum = std::sqrt(sum);
            break;
        default: // p-norm (euclidean is p-norm p=2)
            for (int i = 0; i < n; i++) {
                sum += std::pow(std::abs(inp[i]), embd_norm);
            }
            sum = std::pow(sum, 1.0 / embd_norm);
            break;
    }

    const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;
    }
}
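
// Illustrative example (derived from the code above): with embd_norm == 2 the vector is
// scaled to unit Euclidean length, e.g. {3.0f, 4.0f} -> {0.6f, 0.8f}.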
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n) {
    double sum  = 0.0;
    double sum1 = 0.0;
    double sum2 = 0.0;

    for (int i = 0; i < n; i++) {
        sum  += embd1[i] * embd2[i];
        sum1 += embd1[i] * embd1[i];
        sum2 += embd2[i] * embd2[i];
    }

    // Handle the case where one or both vectors are zero vectors
    if (sum1 == 0.0 || sum2 == 0.0) {
        if (sum1 == 0.0 && sum2 == 0.0) {
            return 1.0f; // two zero vectors are similar
        }
        return 0.0f;
    }

    return sum / (sqrt(sum1) * sqrt(sum2));
}
//
// Control vector utils
//
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
    common_control_vector_data result = { -1, {} };

    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false,
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
        std::string name = gguf_get_tensor_name(ctx_gguf, i);

        int layer_idx = -1;

        // split on '.'
        size_t dotpos = name.find('.');
        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
            try {
                layer_idx = std::stoi(name.substr(dotpos + 1));
            } catch (...) {
                layer_idx = -1;
            }
        }
        if (layer_idx < 0) {
            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // extend if necessary - do not store data for layer 0 (it's not used)
        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

        const float * src = (const float *) tensor->data;
        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
        for (int j = 0; j < result.n_embd; j++) {
            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
        }
    }

    if (result.n_embd == -1) {
        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

    gguf_free(ctx_gguf);
    ggml_free(ctx);

    return result;
}

common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
    common_control_vector_data result = { -1, {} };

    for (const auto & info : load_infos) {
        auto cur = common_control_vector_load_one(info);

        if (cur.n_embd == -1) {
            result.n_embd = -1;
            break;
        }
        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        if (result.n_embd == -1) {
            result = std::move(cur);
        } else {
            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
            for (size_t i = 0; i < cur.data.size(); i++) {
                result.data[i] += cur.data[i];
            }
        }
    }

    if (result.n_embd == -1) {
        LOG_ERR("%s: no valid control vector files passed\n", __func__);
        result.data.clear();
    }

    return result;
}

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
    const int64_t ne_datapoint = llama_n_ctx(ctx);
    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
    ggml_opt_dataset_t result = ggml_opt_dataset_init(
        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;

    for (int64_t idata = 0; idata < ndata; ++idata) {
        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
    }

    return result;
}

ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
    const lr_opt & d = *(lr_opt *) userdata;
    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
    result.sgd.wd = result.adamw.wd = d.wd;
    return result;
}
// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const * a, char const * b) {
    return !
#if defined(_MSC_VER)
        _stricmp
#else
        strcasecmp
#endif // defined(_MSC_VER)
        (a, b);
}

enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
    if (eq_case_insensitive("adamw", n)) {
        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    }
    if (eq_case_insensitive("sgd", n)) {
        return GGML_OPT_OPTIMIZER_TYPE_SGD;
    }
    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
}

// TODO simplify to use just log and exp
static float const k_log_2 = std::log(2.f);

void lr_opt::init() {
    if (lr_min > 0 && lr_min < lr0) {
        float nhalf = std::log(lr0 / lr_min) / k_log_2;
        float e = epochs;
        if (decay_epochs > 0 && decay_epochs < e) {
            e = decay_epochs;
        } else {
            decay_epochs = e;
        }
        scale_epoch = nhalf / e;
    }
}

float lr_opt::get_lr(float epoch) const {
    float r = lr_min <= 0 ? lr0 :
        epoch >= decay_epochs ? lr_min :
        lr0 * std::pow(0.5f, epoch * scale_epoch);
    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
    return r;
}
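
// Illustrative behaviour of the schedule above (a sketch, values rounded): with
// lr0 = 1e-3, lr_min = 1e-4 and decay_epochs = 10, the learning rate is halved
// log2(1e-3 / 1e-4) ~= 3.32 times over the first 10 epochs and then stays at lr_min,
// e.g. get_lr(0) = 1e-3, get_lr(5) ~= 3.2e-4, get_lr(10) = 1e-4.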