#pragma once

#include "common.h"
#include "log.h"
#include "llama.h"
#include "arg.h" // common_remote_get_content
#include "base64.hpp"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "chat.h"

// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
// increase backlog size to avoid connection resets for >> 1 slots
#define CPPHTTPLIB_LISTEN_BACKLOG 512
// increase max URI length to handle longer prompts in query string
#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 32768
// disable Nagle's algorithm
#define CPPHTTPLIB_TCP_NODELAY true
#include <cpp-httplib/httplib.h>

#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>

#include <random>
#include <sstream>
#include <string>
#include <vector>
#include <memory>
#include <cinttypes>

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"

using json = nlohmann::ordered_json;
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)

#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)

#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

using raw_buffer = std::vector<uint8_t>;

template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    // Fallback null to default value
    if (body.contains(key) && !body.at(key).is_null()) {
        try {
            return body.at(key);
        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
            return default_value;
        }
    } else {
        return default_value;
    }
}
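
// Illustrative usage (added for clarity, not part of the original header):
// json_value() reads an optional field and falls back to the default on a missing
// key, a null value, or a type mismatch. The field names below are hypothetical.
//
//   json body  = json::parse(R"({"temperature": 0.8, "seed": null})");
//   float temp = json_value(body, "temperature", 1.0f); // -> 0.8f
//   int   seed = json_value(body, "seed", -1);          // null   -> -1
//   int   topk = json_value(body, "top_k", 40);         // absent -> 40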

const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
    common_grammar_trigger value;

    server_grammar_trigger() = default;
    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
    server_grammar_trigger(const json & in) {
        value.type  = (common_grammar_trigger_type) in.at("type").get<int>();
        value.value = in.at("value").get<std::string>();
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            value.token = (llama_token) in.at("token").get<int>();
        }
    }

    json to_json() const {
        json out {
            {"type",  (int) value.type},
            {"value", value.value},
        };
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            out["token"] = (int) value.token;
        }
        return out;
    }
};

//
// tokenizer and input processing utils
//

static bool json_is_array_of_numbers(const json & data) {
    if (data.is_array()) {
        for (const auto & e : data) {
            if (!e.is_number_integer()) {
                return false;
            }
        }
        return true;
    }
    return false;
}

// is array having BOTH numbers & strings?
static bool json_is_array_of_mixed_numbers_strings(const json & data) {
    bool seen_string = false;
    bool seen_number = false;
    if (data.is_array()) {
        for (const auto & e : data) {
            seen_string |= e.is_string();
            seen_number |= e.is_number_integer();
            if (seen_number && seen_string) {
                return true;
            }
        }
    }
    return false;
}

// does array have any individual integers/tokens?
static bool json_is_array_and_contains_numbers(const json & data) {
    if (data.is_array()) {
        for (const auto & e : data) {
            if (e.is_number_integer()) {
                return true;
            }
        }
        return false;
    }
    return false;
}
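
// Illustrative examples (added for clarity, not in the original source) of how the
// three helpers above classify a prompt-like JSON value:
//
//   json_is_array_of_numbers(json::parse("[1, 2, 3]"));                   // true
//   json_is_array_of_numbers(json::parse(R"([1, "a"])"));                 // false
//   json_is_array_of_mixed_numbers_strings(json::parse(R"([12, "hi"])")); // true
//   json_is_array_and_contains_numbers(json::parse(R"(["a", 5])"));       // true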

// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
    json result = json::object();

    for (const std::string & path : paths) {
        json current = js;
        const auto keys = string_split<std::string>(path, /*separator*/ '/');
        bool valid_path = true;
        for (const std::string & k : keys) {
            if (valid_path && current.is_object() && current.contains(k)) {
                current = current[k];
            } else {
                valid_path = false;
            }
        }
        if (valid_path) {
            result[path] = current;
        }
    }
    return result;
}
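
// Illustrative example (added, not in the original source): paths use '/' to walk
// nested objects, and only paths that fully resolve end up in the result:
//
//   json js = json::parse(R"({"model": {"name": "llama"}, "n": 1})");
//   json r  = json_get_nested_values({"model/name", "missing/key"}, js);
//   // r == {"model/name": "llama"}   ("missing/key" is silently dropped)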

/**
 * this handles 2 cases:
 * - only string, example: "string"
 * - mixed string and tokens, example: [12, 34, "string", 56, 78]
 */
static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
    // If `add_bos` is true, we only add BOS, when json_prompt is a string,
    // or the first element of the json_prompt array is a string.
    llama_tokens prompt_tokens;

    if (json_prompt.is_array()) {
        bool first = true;
        for (const auto & p : json_prompt) {
            if (p.is_string()) {
                auto s = p.template get<std::string>();

                llama_tokens p;
                if (first) {
                    p = common_tokenize(vocab, s, add_special, parse_special);
                    first = false;
                } else {
                    p = common_tokenize(vocab, s, false, parse_special);
                }

                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
            } else {
                if (first) {
                    first = false;
                }

                prompt_tokens.push_back(p.template get<llama_token>());
            }
        }
    } else {
        auto s = json_prompt.template get<std::string>();
        prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
    }

    return prompt_tokens;
}
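
// Illustrative example (added for clarity): a mixed prompt interleaves pre-tokenized
// ids with strings that still need tokenization; special-token handling only applies
// when the first element is a string.
//
//   // json_prompt = [12, 34, "string", 56, 78]
//   //   -> {12, 34} are appended verbatim,
//   //      "string" is tokenized with add_special == false (it is not the first element),
//   //      {56, 78} are appended verbatim.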

// return the last index of character that can form a valid string
// if the last character is potentially cut in half, return the index before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
static size_t validate_utf8(const std::string & text) {
    size_t len = text.size();
    if (len == 0) return 0;

    // Check the last few bytes to see if a multi-byte character is cut off
    for (size_t i = 1; i <= 4 && i <= len; ++i) {
        unsigned char c = text[len - i];
        // Check for start of a multi-byte sequence from the end
        if ((c & 0xE0) == 0xC0) {
            // 2-byte character start: 110xxxxx
            // Needs at least 2 bytes
            if (i < 2) return len - i;
        } else if ((c & 0xF0) == 0xE0) {
            // 3-byte character start: 1110xxxx
            // Needs at least 3 bytes
            if (i < 3) return len - i;
        } else if ((c & 0xF8) == 0xF0) {
            // 4-byte character start: 11110xxx
            // Needs at least 4 bytes
            if (i < 4) return len - i;
        }
    }

    // If no cut-off multi-byte character is found, return full length
    return len;
}
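
// Illustrative example (added): with the 3-byte UTF-8 sequence for the euro sign
// (0xE2 0x82 0xAC), a string cut after the first two bytes is trimmed back to the
// last complete character:
//
//   validate_utf8("ok\xE2\x82\xAC") == 5   // full string is valid
//   validate_utf8("ok\xE2\x82")     == 2   // truncated character -> keep only "ok"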

//
// template utils
//

// format infill task
static llama_tokens format_infill(
        const llama_vocab * vocab,
        const json & input_prefix,
        const json & input_suffix,
        const json & input_extra,
        const int n_batch,
        const int n_predict,
        const int n_ctx,
        const bool spm_infill,
        const llama_tokens & tokens_prompt
    ) {
    // TODO: optimize this block by reducing memory allocations and movement

    // use FIM repo-level pattern:
    // ref: https://arxiv.org/pdf/2409.12186
    //
    // [FIM_REP]myproject
    // [FIM_SEP]filename0
    // extra chunk 0
    // [FIM_SEP]filename1
    // extra chunk 1
    // ...
    // [FIM_SEP]filename
    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
    //
    llama_tokens extra_tokens;
    extra_tokens.reserve(n_ctx);

    auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
    auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);

    if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
        // TODO: make project name an input
        static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);

        extra_tokens.push_back(llama_vocab_fim_rep(vocab));
        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
    }
    for (const auto & chunk : input_extra) {
        // { "text": string, "filename": string }
        const std::string text     = json_value(chunk, "text",     std::string());
        const std::string filename = json_value(chunk, "filename", std::string("tmp"));

        if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
            const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);

            extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
        } else {
            // chunk separator in binary form to avoid confusing the AI
            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
            static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);

            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
        }

        const auto chunk_tokens = common_tokenize(vocab, text, false, false);
        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
    }

    if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
        // TODO: current filename
        static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);

        extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
    }

    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
    const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));

    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));

    // fill the rest of the context with extra chunks
    const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());

    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
    tokens_suffix.resize(n_suffix_take);

    tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
    tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));

    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;

    if (llama_vocab_get_add_bos(vocab)) {
        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
    }

    SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());

    // put the extra context before the FIM prefix
    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());

    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

    embd_inp.push_back(llama_vocab_fim_mid(vocab));

    return embd_inp;
}
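
// Illustrative call sketch (added, not in the original source; the literal values are
// hypothetical). The caller provides the raw prefix/suffix/extra JSON from the /infill
// request plus the already-tokenized prompt, and receives one token sequence arranged
// according to the repo-level FIM pattern documented above:
//
//   llama_tokens embd_inp = format_infill(vocab,
//       /*input_prefix=*/"int main() {\n    ", /*input_suffix=*/"\n    return 0;\n}",
//       /*input_extra =*/json::array(),        /*n_batch=*/2048, /*n_predict=*/128,
//       /*n_ctx=*/4096, /*spm_infill=*/false,  tokens_prompt);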

//
// base64 utils (TODO: move to common in the future)
//

static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

static inline bool is_base64(uint8_t c) {
    return (isalnum(c) || (c == '+') || (c == '/'));
}

static inline raw_buffer base64_decode(const std::string & encoded_string) {
    int i = 0;
    int j = 0;
    int in_ = 0;

    int in_len = encoded_string.size();

    uint8_t char_array_4[4];
    uint8_t char_array_3[3];

    raw_buffer ret;

    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
        char_array_4[i++] = encoded_string[in_]; in_++;

        if (i == 4) {
            for (i = 0; i < 4; i++) {
                char_array_4[i] = base64_chars.find(char_array_4[i]);
            }

            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];

            for (i = 0; (i < 3); i++) {
                ret.push_back(char_array_3[i]);
            }

            i = 0;
        }
    }

    if (i) {
        for (j = i; j < 4; j++) {
            char_array_4[j] = 0;
        }

        for (j = 0; j < 4; j++) {
            char_array_4[j] = base64_chars.find(char_array_4[j]);
        }

        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];

        for (j = 0; j < i - 1; j++) {
            ret.push_back(char_array_3[j]);
        }
    }

    return ret;
}
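
// Illustrative example (added): "aGVsbG8=" is the base64 encoding of "hello", so the
// decoder yields the raw bytes {'h','e','l','l','o'}:
//
//   raw_buffer bytes = base64_decode("aGVsbG8=");   // bytes.size() == 5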

//
// random string / id
//

static std::string random_string() {
    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    std::random_device rd;
    std::mt19937 generator(rd());

    std::string result(32, ' ');

    for (int i = 0; i < 32; ++i) {
        result[i] = str[generator() % str.size()];
    }

    return result;
}

static std::string gen_chatcmplid() {
    return "chatcmpl-" + random_string();
}

static std::string gen_tool_call_id() {
    return random_string();
}

//
// other common utils
//

// TODO: reuse llama_detokenize
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += common_token_to_piece(ctx, *begin);
    }

    return ret;
}

// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
    std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);

    // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
        std::string res(ss.str());
        out = "byte: \\x" + res;
    }

    return out;
}

static bool server_sent_event(httplib::DataSink & sink, const json & data) {
    const std::string str =
        "data: " +
        data.dump(-1, ' ', false, json::error_handler_t::replace) +
        "\n\n"; // required by the SSE spec - a message is terminated by a blank line (two line terminators in a row)

    LOG_DBG("data stream, to_send: %s", str.c_str());

    return sink.write(str.c_str(), str.size());
}
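
// Illustrative wire format (added for clarity): for data == {"content":"hi"} the sink
// receives exactly one SSE frame, terminated by a blank line:
//
//   data: {"content":"hi"}
//   <blank line>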

//
// OAI utils
//

// used by /completions endpoint
static json oaicompat_completion_params_parse(const json & body) {
    json llama_params;

    if (!body.contains("prompt")) {
        throw std::runtime_error("\"prompt\" is required");
    }

    // Handle "stop" field
    if (body.contains("stop") && body.at("stop").is_string()) {
        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Handle "n" field
    int n_choices = json_value(body, "n", 1);
    if (n_choices != 1) {
        throw std::runtime_error("Only one completion choice is allowed");
    }

    // Handle "echo" field
    if (json_value(body, "echo", false)) {
        throw std::runtime_error("Only no echo is supported");
    }

    // Params supported by OAI but unsupported by llama.cpp
    static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
    for (const auto & param : unsupported_params) {
        if (body.contains(param)) {
            throw std::runtime_error("Unsupported param: " + param);
        }
    }

    // Copy remaining properties to llama_params
    for (const auto & item : body.items()) {
        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
            llama_params[item.key()] = item.value();
        }
    }

    return llama_params;
}
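
// Illustrative mapping (added, not in the original source): a minimal OAI-style
// /completions body passes through mostly unchanged, with "stop" normalized to an array:
//
//   // in : {"prompt": "Hello", "max_tokens": 16, "stop": "\n"}
//   // out: {"stop": ["\n"], "prompt": "Hello", "max_tokens": 16}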

struct oaicompat_parser_options {
    bool use_jinja;
    bool prefill_assistant;
    common_reasoning_format reasoning_format;
    std::map<std::string, std::string> chat_template_kwargs;
    common_chat_templates * tmpls;
    bool allow_image;
    bool allow_audio;
    bool enable_thinking = true;
};

// used by /chat/completions endpoint
static json oaicompat_chat_params_parse(
    json & body, /* openai api json semantics */
    const oaicompat_parser_options & opt,
    std::vector<raw_buffer> & out_files)
{
    json llama_params;

    auto tools = json_value(body, "tools", json());
    auto has_tools = tools.is_array() && !tools.empty();
    auto stream = json_value(body, "stream", false);
    auto tool_choice = json_value(body, "tool_choice", std::string("auto"));

    if (!opt.use_jinja) {
        if (has_tools) {
            throw std::runtime_error("tools param requires --jinja flag");
        }
        if (tool_choice != "auto") {
            throw std::runtime_error("tool_choice param requires --jinja flag");
        }
    }

    // Handle "stop" field
    if (body.contains("stop") && body.at("stop").is_string()) {
        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    auto json_schema = json_value(body, "json_schema", json());
    auto grammar = json_value(body, "grammar", std::string());
    if (!json_schema.is_null() && !grammar.empty()) {
        throw std::runtime_error("Cannot use both json_schema and grammar");
    }

    // Handle "response_format" field
    if (body.contains("response_format")) {
        json response_format      = json_value(body, "response_format", json::object());
        std::string response_type = json_value(response_format, "type", std::string());
        if (response_type == "json_object") {
            json_schema = json_value(response_format, "schema", json::object());
        } else if (response_type == "json_schema") {
            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
            json_schema = json_value(schema_wrapper, "schema", json::object());
        } else if (!response_type.empty() && response_type != "text") {
            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
        }
    }

    // get input files
    if (!body.contains("messages")) {
        throw std::runtime_error("'messages' is required");
    }
    json & messages = body.at("messages");
    if (!messages.is_array()) {
        throw std::runtime_error("Expected 'messages' to be an array");
    }
    for (auto & msg : messages) {
        std::string role = json_value(msg, "role", std::string());
        if (role != "assistant" && !msg.contains("content")) {
            throw std::runtime_error("All non-assistant messages must contain 'content'");
        }
        if (role == "assistant") {
            if (!msg.contains("content") && !msg.contains("tool_calls")) {
                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
            }
            if (!msg.contains("content")) {
                continue; // avoid errors with no content
            }
        }
        json & content = msg.at("content");
        if (content.is_string() || content.is_null()) {
            continue;
        }

        if (!content.is_array()) {
            throw std::runtime_error("Expected 'content' to be a string or an array");
        }

        for (auto & p : content) {
            std::string type = json_value(p, "type", std::string());
            if (type == "image_url") {
                if (!opt.allow_image) {
                    throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
                }

                json image_url = json_value(p, "image_url", json::object());
                std::string url = json_value(image_url, "url", std::string());
                if (string_starts_with(url, "http")) {
                    // download remote image
                    // TODO @ngxson : maybe make these params configurable
                    common_remote_params params;
                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
                    params.max_size = 1024 * 1024 * 10; // 10MB
                    params.timeout  = 10; // seconds
                    SRV_INF("downloading image from '%s'\n", url.c_str());
                    auto res = common_remote_get_content(url, params);
                    if (200 <= res.first && res.first < 300) {
                        SRV_INF("downloaded %ld bytes\n", res.second.size());
                        raw_buffer data;
                        data.insert(data.end(), res.second.begin(), res.second.end());
                        out_files.push_back(data);
                    } else {
                        throw std::runtime_error("Failed to download image");
                    }

                } else {
                    // try to decode base64 image
                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
                    if (parts.size() != 2) {
                        throw std::runtime_error("Invalid image_url.url value");
                    } else if (!string_starts_with(parts[0], "data:image/")) {
                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
                    } else if (!string_ends_with(parts[0], "base64")) {
                        throw std::runtime_error("image_url.url must be base64 encoded");
                    } else {
                        auto base64_data = parts[1];
                        auto decoded_data = base64_decode(base64_data);
                        out_files.push_back(decoded_data);
                    }
                }

                // replace this chunk with a marker
                p["type"] = "text";
                p["text"] = mtmd_default_marker();
                p.erase("image_url");

            } else if (type == "input_audio") {
                if (!opt.allow_audio) {
                    throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
                }

                json input_audio   = json_value(p, "input_audio", json::object());
                std::string data   = json_value(input_audio, "data", std::string());
                std::string format = json_value(input_audio, "format", std::string());
                // while we also support flac, we don't allow it here so we match the OAI spec
                if (format != "wav" && format != "mp3") {
                    throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
                }
                auto decoded_data = base64_decode(data); // expected to be base64 encoded
                out_files.push_back(decoded_data);

                // replace this chunk with a marker
                p["type"] = "text";
                p["text"] = mtmd_default_marker();
                p.erase("input_audio");

            } else if (type != "text") {
                throw std::runtime_error("unsupported content[].type");
            }
        }
    }

    common_chat_templates_inputs inputs;
    inputs.messages              = common_chat_msgs_parse_oaicompat(messages);
    inputs.tools                 = common_chat_tools_parse_oaicompat(tools);
    inputs.tool_choice           = common_chat_tool_choice_parse_oaicompat(tool_choice);
    inputs.json_schema           = json_schema.is_null() ? "" : json_schema.dump();
    inputs.grammar               = grammar;
    inputs.use_jinja             = opt.use_jinja;
    inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", false);
    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
    inputs.reasoning_format      = opt.reasoning_format;
    inputs.enable_thinking       = opt.enable_thinking;
    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
        if (body.contains("grammar")) {
            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
        }
        llama_params["parse_tool_calls"] = true;
    }

    // merge the template args provided from command line with the args provided in the user request
    auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
    inputs.chat_template_kwargs = opt.chat_template_kwargs;
    for (const auto & item : chat_template_kwargs_object.items()) {
        inputs.chat_template_kwargs[item.key()] = item.value().dump();
    }

    // parse the "enable_thinking" kwarg to override the default value
    auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
    if (enable_thinking_kwarg == "true") {
        inputs.enable_thinking = true;
    } else if (enable_thinking_kwarg == "false") {
        inputs.enable_thinking = false;
    } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
    }

    // if the assistant message appears at the end of list, we do not add end-of-turn token
    // for ex. this can be useful to modify the reasoning process in reasoning models
    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
    common_chat_msg last_message;
    if (prefill_assistant_message) {
        last_message = inputs.messages.back();
        inputs.messages.pop_back();

        /* sanity check, max one assistant message at the end of the list */
        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant") {
            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
        }

        /* TODO: test this properly */
        inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;

        if (inputs.enable_thinking) {
            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
        }

        inputs.add_generation_prompt = true;
    }

    // Apply chat template to the list of messages
    auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);

    /* Append assistant prefilled message */
    if (prefill_assistant_message) {
        if (!last_message.content_parts.empty()) {
            for (auto & p : last_message.content_parts) {
                chat_params.prompt += p.text;
            }
        } else {
            chat_params.prompt += last_message.content;
        }
    }

    llama_params["chat_format"] = static_cast<int>(chat_params.format);
    llama_params["prompt"]      = chat_params.prompt;
    if (!chat_params.grammar.empty()) {
        llama_params["grammar"] = chat_params.grammar;
    }
    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
    auto grammar_triggers = json::array();
    for (const auto & trigger : chat_params.grammar_triggers) {
        server_grammar_trigger ct(trigger);
        grammar_triggers.push_back(ct.to_json());
    }
    llama_params["grammar_triggers"] = grammar_triggers;
    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
    llama_params["thinking_forced_open"] = chat_params.thinking_forced_open;
    for (const auto & stop : chat_params.additional_stops) {
        llama_params["stop"].push_back(stop);
    }

    // Handle "n" field
    int n_choices = json_value(body, "n", 1);
    if (n_choices != 1) {
        throw std::runtime_error("Only one completion choice is allowed");
    }

    // Handle "logprobs" field
    // TODO: The response format of this option is not yet OAI-compatible, but it seems like no one is really using it; we may need to fix it in the future
    if (json_value(body, "logprobs", false)) {
        if (has_tools && stream) {
            throw std::runtime_error("logprobs is not supported with tools + stream");
        }
        llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
    } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
    }

    // Copy remaining properties to llama_params
    // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
    // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
    for (const auto & item : body.items()) {
        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
            llama_params[item.key()] = item.value();
        }
    }

    return llama_params;
}
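
// Illustrative usage sketch (added, not in the original source; `opts` and
// `request_body_str` are hypothetical): the /chat/completions handler passes the
// request body plus the parser options, and collects any decoded image/audio payloads
// in `files` for multimodal input:
//
//   std::vector<raw_buffer> files;
//   json body   = json::parse(request_body_str);
//   json parsed = oaicompat_chat_params_parse(body, opts, files);
//   // parsed["prompt"] holds the templated prompt; "stop", grammar fields, etc. are also set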

static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
    json data = json::array();
    int32_t n_tokens = 0;
    int i = 0;
    for (const auto & elem : embeddings) {
        json embedding_obj;

        if (use_base64) {
            const auto & vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
            const char * data_ptr = reinterpret_cast<const char *>(vec.data());
            size_t data_size = vec.size() * sizeof(float);
            embedding_obj = {
                {"embedding", base64::encode(data_ptr, data_size)},
                {"index", i++},
                {"object", "embedding"},
                {"encoding_format", "base64"}
            };
        } else {
            embedding_obj = {
                {"embedding", json_value(elem, "embedding", json::array())},
                {"index", i++},
                {"object", "embedding"}
            };
        }
        data.push_back(embedding_obj);

        n_tokens += json_value(elem, "tokens_evaluated", 0);
    }

    json res = json {
        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", "list"},
        {"usage", json {
            {"prompt_tokens", n_tokens},
            {"total_tokens", n_tokens}
        }},
        {"data", data}
    };

    return res;
}
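
// Illustrative response shape (added for clarity; values are hypothetical), for a
// single embedding without base64 encoding:
//
//   {
//     "model": "gpt-3.5-turbo",
//     "object": "list",
//     "usage": {"prompt_tokens": 8, "total_tokens": 8},
//     "data": [{"embedding": [0.1, 0.2, ...], "index": 0, "object": "embedding"}]
//   }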

static json format_response_rerank(
    const json & request,
    const json & ranks,
    bool is_tei_format,
    std::vector<std::string> & texts,
    int top_n) {
    int32_t n_tokens = 0;
    bool return_text = is_tei_format && json_value(request, "return_text", false);

    std::vector<json> elements; // Temporary vector to hold unsorted elements
    std::string score_label = is_tei_format ? "score" : "relevance_score";
    for (const auto & rank : ranks) {
        int index = json_value(rank, "index", 0);
        json elem = json{
            {"index", index},
            {score_label, json_value(rank, "score", 0.0)},
        };
        n_tokens += json_value(rank, "tokens_evaluated", 0);
        if (return_text) {
            elem["text"] = std::move(texts[index]);
        }
        elements.push_back(elem);
    }

    std::sort(elements.begin(), elements.end(), [score_label](const json & a, const json & b) {
        return json_value(a, score_label, 0.0) > json_value(b, score_label, 0.0);
    });

    elements.resize(std::min(top_n, (int) elements.size()));
    json results = elements;

    if (is_tei_format) return results;

    json res = json{
        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", "list"},
        {"usage", json{
            {"prompt_tokens", n_tokens},
            {"total_tokens", n_tokens}
        }},
        {"results", results}
    };

    return res;
}
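
// Illustrative response shape (added for clarity; scores are hypothetical). In the
// non-TEI format the sorted, top_n-truncated results are wrapped with usage info:
//
//   {
//     "model": "gpt-3.5-turbo",
//     "object": "list",
//     "usage": {"prompt_tokens": 26, "total_tokens": 26},
//     "results": [{"index": 2, "relevance_score": 8.5}, {"index": 0, "relevance_score": 1.2}]
//   }
//
// With is_tei_format == true only the bare array is returned and the score key is "score".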

static bool is_valid_utf8(const std::string & str) {
    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(str.data());
    const unsigned char * end = bytes + str.length();

    while (bytes < end) {
        if (*bytes <= 0x7F) {
            // 1-byte sequence (0xxxxxxx)
            bytes++;
        } else if ((*bytes & 0xE0) == 0xC0) {
            // 2-byte sequence (110xxxxx 10xxxxxx)
            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
                return false;
            bytes += 2;
        } else if ((*bytes & 0xF0) == 0xE0) {
            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
                return false;
            bytes += 3;
        } else if ((*bytes & 0xF8) == 0xF0) {
            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
                return false;
            bytes += 4;
        } else {
            // Invalid UTF-8 lead byte
            return false;
        }
    }

    return true;
}
static json format_tokenizer_response ( const json & tokens ) {
2024-03-07 11:41:53 +02:00
return json {
{ " tokens " , tokens }
} ;
}
static json format_detokenized_response ( const std : : string & content ) {
return json {
{ " content " , content }
} ;
}
2024-12-07 20:21:09 +01:00
static json format_logit_bias ( const std : : vector < llama_logit_bias > & logit_bias ) {
json data = json : : array ( ) ;
for ( const auto & lb : logit_bias ) {
data . push_back ( json {
{ " bias " , lb . bias } ,
{ " token " , lb . token } ,
} ) ;
}
return data ;
}
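// Example of the produced array (values are hypothetical; llama_logit_bias pairs a token id
// with a float bias):
//
//   std::vector<llama_logit_bias> lb = { { /*token*/ 42, /*bias*/ 1.5f }, { /*token*/ 7, /*bias*/ -100.0f } };
//   // format_logit_bias(lb) == [ { "bias": 1.5, "token": 42 }, { "bias": -100.0, "token": 7 } ]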
2025-01-12 11:32:42 +02:00
static std : : string safe_json_to_str ( const json & data ) {
2024-12-07 20:21:09 +01:00
return data . dump ( - 1 , ' ' , false , json : : error_handler_t : : replace ) ;
}
2024-12-19 15:40:08 +01:00
static std : : vector < llama_token_data > get_token_probabilities ( llama_context * ctx , int idx ) {
std : : vector < llama_token_data > cur ;
const auto * logits = llama_get_logits_ith ( ctx , idx ) ;
2025-01-12 11:32:42 +02:00
const llama_model * model = llama_get_model ( ctx ) ;
const llama_vocab * vocab = llama_model_get_vocab ( model ) ;
const int n_vocab = llama_vocab_n_tokens ( vocab ) ;
2024-12-19 15:40:08 +01:00
cur . resize ( n_vocab ) ;
for ( llama_token token_id = 0 ; token_id < n_vocab ; token_id + + ) {
cur [ token_id ] = llama_token_data { token_id , logits [ token_id ] , 0.0f } ;
}
// sort tokens by logits
std : : sort ( cur . begin ( ) , cur . end ( ) , [ ] ( const llama_token_data & a , const llama_token_data & b ) {
return a . logit > b . logit ;
} ) ;
// apply softmax
float max_l = cur [ 0 ] . logit ;
float cum_sum = 0.0f ;
for ( size_t i = 0 ; i < cur . size ( ) ; + + i ) {
float p = expf ( cur [ i ] . logit - max_l ) ;
cur [ i ] . p = p ;
cum_sum + = p ;
}
for ( size_t i = 0 ; i < cur . size ( ) ; + + i ) {
cur [ i ] . p / = cum_sum ;
}
return cur ;
}
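// Sketch of how a caller might read the top-k candidates from the sorted, softmax-normalized
// vector returned above (variable names are illustrative, not from the original code):
//
//   std::vector<llama_token_data> probs = get_token_probabilities(ctx, /*idx*/ 0);
//   const size_t k = std::min<size_t>(10, probs.size());
//   for (size_t i = 0; i < k; ++i) {
//       SRV_DBG("token %d -> p = %.4f\n", probs[i].id, probs[i].p);
//   }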
2025-01-02 15:05:18 +01:00
static bool are_lora_equal (
2025-01-12 11:32:42 +02:00
const std : : vector < common_adapter_lora_info > & l1 ,
const std : : vector < common_adapter_lora_info > & l2 ) {
2025-01-02 15:05:18 +01:00
if ( l1 . size ( ) ! = l2 . size ( ) ) {
return false ;
}
for ( size_t i = 0 ; i < l1 . size ( ) ; + + i ) {
// we don't check lora.path to reduce the time complexity
2025-01-03 10:18:53 +02:00
if ( l1 [ i ] . scale ! = l2 [ i ] . scale | | l1 [ i ] . ptr ! = l2 [ i ] . ptr ) {
2025-01-02 15:05:18 +01:00
return false ;
}
}
return true ;
}
2025-09-05 17:32:39 -06:00
// get the ids of all enabled loras
static std : : vector < size_t > lora_get_enabled_ids ( const std : : vector < common_adapter_lora_info > & loras ) {
std : : vector < size_t > enabled_ids ;
for ( size_t i = 0 ; i < loras . size ( ) ; + + i ) {
if ( loras [ i ] . scale > 0 ) {
enabled_ids . push_back ( i ) ;
}
}
return enabled_ids ;
}
// check whether the given lora set has only aloras activated (empty => false)
static bool lora_all_alora ( const std : : vector < common_adapter_lora_info > & loras ) {
bool found_alora = false ;
for ( const auto & lora : loras ) {
if ( lora . scale ! = 0 ) {
if ( llama_adapter_get_alora_n_invocation_tokens ( lora . ptr ) = = 0 ) {
return false ;
}
found_alora = true ;
}
}
return found_alora ;
}
// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
static bool lora_should_clear_cache (
const std : : vector < common_adapter_lora_info > & current ,
const std : : vector < common_adapter_lora_info > & next ) {
// This should always be called after determining that the two sets are
// _not_ equal. This assert is therefore some slightly wasted work and
// should be safe to remove as long as this method is called correctly.
GGML_ASSERT ( ! are_lora_equal ( current , next ) ) ;
return (
! ( lora_get_enabled_ids ( current ) . empty ( ) | | lora_all_alora ( current ) ) | |
! lora_all_alora ( next ) ) ;
}
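// Decision table implied by the expression above (a sketch; "alora-only" means every enabled
// adapter is an alora, "regular" means at least one enabled non-alora adapter):
//
//   current \ next   | alora-only | regular or none
//   -----------------+------------+----------------
//   none enabled     | keep cache | clear cache
//   alora-only       | keep cache | clear cache
//   regular          | clear      | clear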
2025-01-03 10:18:53 +02:00
// parse lora config from JSON request, returns a copy of lora_base with updated scales
2025-01-12 11:32:42 +02:00
static std : : vector < common_adapter_lora_info > parse_lora_request (
const std : : vector < common_adapter_lora_info > & lora_base ,
2025-01-02 15:05:18 +01:00
const json & data ) {
2025-01-12 11:32:42 +02:00
std : : vector < common_adapter_lora_info > lora ( lora_base ) ;
2025-01-02 15:05:18 +01:00
int max_idx = lora . size ( ) ;
// clear existing value
for ( auto & entry : lora ) {
entry . scale = 0.0f ;
}
// set value
for ( const auto & entry : data ) {
int id = json_value ( entry , " id " , - 1 ) ;
float scale = json_value ( entry , " scale " , 0.0f ) ;
if ( 0 < = id & & id < max_idx ) {
lora [ id ] . scale = scale ;
} else {
throw std : : runtime_error ( " invalid adapter id " ) ;
}
}
return lora ;
}
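// Example of the JSON this expects (a hypothetical request body): with two adapters loaded,
// the following enables adapter 0 at half strength and leaves adapter 1 disabled; any id
// outside [0, lora_base.size()) raises "invalid adapter id":
//
//   [ { "id": 0, "scale": 0.5 } ]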
2025-05-09 19:29:37 +02:00
//
// utils for interacting with libmtmd
// (may need to refactor in near future)
//
/**
 * server_tokens is a helper to manage the input tokens and media (image/audio) chunks for the server .
 * it is structured this way to simplify the logic of KV cache management .
 */
struct server_tokens {
bool has_mtmd = false ;
private : // disallow direct access to these members, to avoid them getting out of sync
2025-10-30 18:42:57 +02:00
// map a **start** index in tokens to the media chunk
// note: the order needs to stay in sync with tokens
std : : map < size_t , mtmd : : input_chunk_ptr > map_idx_to_media ;
2025-05-09 19:29:37 +02:00
// list of tokens
2025-10-30 18:42:57 +02:00
// if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by a media chunk
// otherwise, it is a normal text token
// note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
// note(2): for M-RoPE, an image can occupy a different number of pos than tokens; do not assume a 1-to-1 mapping tokens <-> pos
2025-05-09 19:29:37 +02:00
llama_tokens tokens ;
2025-10-30 18:42:57 +02:00
// for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
// idx 0 1 2 3 4 5 6 7 8 9 10
// pos 0 1 2 3 4 5 5 5 7 7 7
// map_idx_to_media will contain: {5, img0}, {8, img1}
2025-05-09 19:29:37 +02:00
public :
server_tokens ( ) = default ;
~ server_tokens ( ) = default ;
// Prevent copying
2025-10-09 18:54:51 +03:00
// TODO: server_tokens should be copyable - remove this:
2025-05-09 19:29:37 +02:00
server_tokens ( const server_tokens & ) = delete ;
server_tokens & operator = ( const server_tokens & ) = delete ;
// Allow moving (usually implicitly generated if members are movable)
server_tokens ( server_tokens & & ) = default ;
server_tokens & operator = ( server_tokens & & ) = default ;
// Allow accessing elements using [] operator
llama_token operator [ ] ( size_t index ) { return tokens [ index ] ; }
const llama_token & operator [ ] ( size_t index ) const { return tokens [ index ] ; }
server_tokens ( mtmd : : input_chunks & mtmd_chunks , bool has_mtmd ) : has_mtmd ( has_mtmd ) {
for ( size_t i = 0 ; i < mtmd_chunks . size ( ) ; + + i ) {
push_back ( mtmd_chunks [ i ] ) ;
}
}
2025-10-30 18:42:57 +02:00
server_tokens ( const llama_tokens & tokens , bool has_mtmd ) : has_mtmd ( has_mtmd ) , tokens ( tokens ) {
}
llama_pos pos_next ( ) const {
if ( ! has_mtmd ) {
return tokens . size ( ) ;
}
llama_pos res = tokens . size ( ) ;
for ( auto it = map_idx_to_media . begin ( ) ; it ! = map_idx_to_media . end ( ) ; + + it ) {
const auto & chunk = it - > second ;
res + = mtmd_input_chunk_get_n_pos ( chunk . get ( ) ) - mtmd_input_chunk_get_n_tokens ( chunk . get ( ) ) ;
}
return res ;
}
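// Example (reusing the layout sketched above, where each image spans 3 tokens but only 2 pos):
// tokens.size() == 11 and each media chunk contributes (n_pos - n_tokens) == -1, so
// pos_next() == 11 - 2 == 9, i.e. the first free position after img1 (which starts at pos 7
// and covers 2 positions).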
2025-05-09 19:29:37 +02:00
// for debugging
std : : string str ( ) const {
std : : ostringstream oss ;
oss < < " tokens: " ;
2025-10-30 18:42:57 +02:00
for ( size_t idx = 0 ; idx < tokens . size ( ) ; + + idx ) {
llama_token t = tokens [ idx ] ;
oss < < " idx: " < < idx < < " " ;
2025-05-09 19:29:37 +02:00
if ( t = = LLAMA_TOKEN_NULL ) {
oss < < " <embd> " ;
} else {
oss < < t < < " " ;
}
}
oss < < " \n " ;
2025-10-30 18:42:57 +02:00
oss < < " image idx: " ;
for ( const auto & it : map_idx_to_media ) {
2025-05-09 19:29:37 +02:00
oss < < it . first < < " , " ;
}
return oss . str ( ) ;
}
2025-10-30 18:42:57 +02:00
const mtmd : : input_chunk_ptr & find_chunk ( size_t idx ) const {
auto it = map_idx_to_media . find ( idx ) ;
if ( it ! = map_idx_to_media . end ( ) ) {
2025-05-09 19:29:37 +02:00
return it - > second ;
}
2025-10-09 18:54:51 +03:00
throw std : : runtime_error ( " Chunk not found " ) ;
2025-05-09 19:29:37 +02:00
}
void push_back ( llama_token tok ) {
if ( tok = = LLAMA_TOKEN_NULL ) {
throw std : : runtime_error ( " Invalid token " ) ;
}
tokens . emplace_back ( tok ) ;
}
// will create a copy of the chunk if it contains non-text data
void push_back ( const mtmd_input_chunk * chunk ) {
auto type = mtmd_input_chunk_get_type ( chunk ) ;
2025-05-23 11:03:47 +02:00
if ( type = = MTMD_INPUT_CHUNK_TYPE_IMAGE | | type = = MTMD_INPUT_CHUNK_TYPE_AUDIO ) {
2025-05-09 19:29:37 +02:00
GGML_ASSERT ( has_mtmd ) ;
2025-10-30 18:42:57 +02:00
const size_t n_tokens = mtmd_input_chunk_get_n_tokens ( chunk ) ;
size_t start_idx = tokens . size ( ) ;
for ( size_t i = 0 ; i < n_tokens ; + + i ) {
2025-05-09 19:29:37 +02:00
tokens . emplace_back ( LLAMA_TOKEN_NULL ) ;
}
mtmd : : input_chunk_ptr new_chunk ( mtmd_input_chunk_copy ( chunk ) ) ;
2025-10-30 18:42:57 +02:00
map_idx_to_media [ start_idx ] = std : : move ( new_chunk ) ;
2025-05-09 19:29:37 +02:00
} else if ( type = = MTMD_INPUT_CHUNK_TYPE_TEXT ) {
size_t n_tokens ;
2025-10-09 18:54:51 +03:00
const auto * text_tokens = mtmd_input_chunk_get_tokens_text ( chunk , & n_tokens ) ;
2025-05-09 19:29:37 +02:00
for ( size_t i = 0 ; i < n_tokens ; + + i ) {
push_back ( text_tokens [ i ] ) ;
}
} else {
GGML_ABORT ( " Invalid chunk type " ) ;
}
}
2025-08-22 08:10:14 +00:00
// appends server tokens, updates the media map. copies media chunks.
void push_back ( server_tokens & tokens ) {
2025-10-30 18:42:57 +02:00
size_t start_idx = size ( ) ;
2025-08-22 08:10:14 +00:00
for ( size_t i = 0 ; i < tokens . size ( ) ; i + + ) {
push_back ( tokens [ i ] ) ;
}
if ( tokens . has_mtmd ) {
// Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
// We could also just check, but this will prevent silently dropping MTMD data.
GGML_ASSERT ( has_mtmd ) ;
2025-10-30 18:42:57 +02:00
for ( auto it = tokens . map_idx_to_media . begin ( ) ; it ! = tokens . map_idx_to_media . end ( ) ; ) {
auto * chunk = tokens . map_idx_to_media [ it - > first ] . get ( ) ;
2025-08-22 08:10:14 +00:00
mtmd : : input_chunk_ptr new_chunk ( mtmd_input_chunk_copy ( chunk ) ) ;
2025-11-02 18:14:04 +02:00
map_idx_to_media [ start_idx + it - > first ] = std : : move ( new_chunk ) ;
2025-08-22 08:10:14 +00:00
+ + it ;
}
}
}
2025-05-09 19:29:37 +02:00
// for compatibility with context shift and prompt truncation
void insert ( const llama_tokens & inp_tokens ) {
GGML_ASSERT ( ! has_mtmd ) ; // only allow this if mtmd is disabled
tokens . insert ( tokens . end ( ) , inp_tokens . begin ( ) , inp_tokens . end ( ) ) ;
}
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens ( ) const {
GGML_ASSERT ( ! has_mtmd ) ; // only allow this if mtmd is disabled
return tokens ;
}
// for compatibility with speculative decoding
void set_token ( llama_pos pos , llama_token id ) {
GGML_ASSERT ( ! has_mtmd ) ; // only allow this if mtmd is disabled
tokens [ pos ] = id ;
}
size_t size ( ) const {
return tokens . size ( ) ;
}
bool empty ( ) const {
return tokens . empty ( ) ;
}
void clear ( ) {
2025-11-02 18:14:04 +02:00
map_idx_to_media . clear ( ) ;
2025-05-09 19:29:37 +02:00
tokens . clear ( ) ;
}
2025-05-14 13:35:07 +02:00
void keep_first ( size_t n ) {
2025-05-09 19:29:37 +02:00
GGML_ASSERT ( n < = tokens . size ( ) ) ;
if ( has_mtmd ) {
2025-05-23 11:03:47 +02:00
if ( n = = tokens . size ( ) ) {
return ; // nothing to do
}
2025-05-09 19:29:37 +02:00
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// n 1 2 3 4 5 6 7 8 9 10
// allowed to resize ^ ^
// disallowed to resize ^ ^ ^
if ( n > 0 ) {
// make sure we never remove tokens in the middle of an image
2025-10-15 12:51:27 +03:00
// note that the case where we keep a full image at the end is allowed:
// tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL
if ( tokens [ n - 1 ] = = LLAMA_TOKEN_NULL & & tokens [ n ] = = LLAMA_TOKEN_NULL ) {
2025-05-09 19:29:37 +02:00
find_chunk ( n - 1 ) ; // will throw an error if the token is not begin-of-chunk
}
}
// remove all image chunks that are not used anymore
2025-10-30 18:42:57 +02:00
for ( auto it = map_idx_to_media . begin ( ) ; it ! = map_idx_to_media . end ( ) ; ) {
size_t idx = it - > first ;
if ( idx > = n ) {
it = map_idx_to_media . erase ( it ) ;
2025-05-09 19:29:37 +02:00
} else {
+ + it ;
}
}
}
tokens . resize ( n ) ;
}
std : : string detokenize ( const llama_context * ctx , bool special ) const {
llama_tokens text_tokens ;
text_tokens . reserve ( tokens . size ( ) ) ;
for ( const auto & t : tokens ) {
if ( t ! = LLAMA_TOKEN_NULL ) {
text_tokens . push_back ( t ) ;
}
}
return common_detokenize ( ctx , text_tokens , special ) ;
}
size_t get_common_prefix ( const server_tokens & b ) const {
2025-10-09 18:54:51 +03:00
const size_t max_idx = std : : min ( tokens . size ( ) , b . tokens . size ( ) ) ;
if ( ! has_mtmd ) {
for ( size_t i = 0 ; i < max_idx ; + + i ) {
if ( tokens [ i ] = = b . tokens [ i ] ) {
continue ;
}
return i ;
}
return max_idx ;
}
2025-05-09 19:29:37 +02:00
for ( size_t i = 0 ; i < max_idx ; + + i ) {
2025-10-09 18:54:51 +03:00
const llama_token ai = tokens [ i ] ;
const llama_token bi = b . tokens [ i ] ;
2025-05-09 19:29:37 +02:00
if ( ai = = LLAMA_TOKEN_NULL & & bi = = LLAMA_TOKEN_NULL ) {
const auto & a_chunk = find_chunk ( i ) ;
const auto & b_chunk = b . find_chunk ( i ) ;
2025-10-09 18:54:51 +03:00
2025-05-09 19:29:37 +02:00
GGML_ASSERT ( a_chunk & & b_chunk ) ;
2025-10-09 18:54:51 +03:00
const std : : string id_ai = mtmd_input_chunk_get_id ( a_chunk . get ( ) ) ;
const std : : string id_bi = mtmd_input_chunk_get_id ( b_chunk . get ( ) ) ;
2025-10-30 18:42:57 +02:00
const size_t n_tok_a = mtmd_input_chunk_get_n_tokens ( a_chunk . get ( ) ) ;
const size_t n_tok_b = mtmd_input_chunk_get_n_tokens ( b_chunk . get ( ) ) ;
2025-10-09 18:54:51 +03:00
2025-10-30 18:42:57 +02:00
if ( id_ai = = id_bi & & n_tok_a = = n_tok_b ) {
GGML_ASSERT ( n_tok_a > 0 & & " Invalid media chunk " ) ; // should never happen
i + = n_tok_a - 1 ; // will be +1 by the for loop
2025-05-09 19:29:37 +02:00
continue ;
}
2025-10-09 18:54:51 +03:00
2025-05-09 19:29:37 +02:00
return i ;
}
2025-10-09 18:54:51 +03:00
if ( ai = = bi ) {
continue ;
}
return i ;
2025-05-09 19:29:37 +02:00
}
2025-10-09 18:54:51 +03:00
2025-05-09 19:29:37 +02:00
return max_idx ; // all tokens are equal
}
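// Sketch: comparing [t0 t1 img0 img0 img0 t2] with [t0 t1 img0 img0 img0 t3] returns 5 -
// the media chunk counts as matching because its id and token count agree, and the mismatch
// is only found at the trailing text token; if the two prompts carried *different* images at
// index 2, the function would return 2 instead.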
// make sure all text tokens are within the vocab range
bool validate ( const struct llama_context * ctx ) const {
const llama_model * model = llama_get_model ( ctx ) ;
const llama_vocab * vocab = llama_model_get_vocab ( model ) ;
const int32_t n_vocab = llama_vocab_n_tokens ( vocab ) ;
for ( size_t i = 0 ; i < tokens . size ( ) ; + + i ) {
2025-10-09 18:54:51 +03:00
const auto & t = tokens [ i ] ;
2025-05-09 19:29:37 +02:00
if ( t = = LLAMA_TOKEN_NULL ) {
try {
const auto & chunk = find_chunk ( i ) ;
2025-10-30 18:42:57 +02:00
size_t n_tokens = mtmd_input_chunk_get_n_tokens ( chunk . get ( ) ) ;
i + = n_tokens - 1 ; // will be +1 by the for loop
2025-05-09 19:29:37 +02:00
} catch ( const std : : exception & e ) {
return false ;
}
} else if ( t < 0 | | t > = n_vocab ) {
return false ;
}
}
return true ;
}
// encode and decode the media chunk (image or audio)
int32_t process_chunk (
llama_context * ctx ,
mtmd_context * mctx ,
2025-10-30 18:42:57 +02:00
size_t idx ,
llama_pos pos ,
2025-05-09 19:29:37 +02:00
int32_t seq_id ,
2025-10-30 18:42:57 +02:00
size_t & n_tokens_out ) const {
const auto & chunk = find_chunk ( idx ) ;
2025-05-23 11:03:47 +02:00
const char * name = mtmd_input_chunk_get_type ( chunk . get ( ) ) = = MTMD_INPUT_CHUNK_TYPE_IMAGE
? " image " : " audio " ;
SRV_INF ( " processing %s... \n " , name ) ;
2025-05-09 19:29:37 +02:00
int32_t n_batch = llama_n_batch ( ctx ) ;
int64_t t0 = ggml_time_ms ( ) ;
2025-10-30 18:42:57 +02:00
llama_pos new_n_past ; // unused for now
2025-05-09 19:29:37 +02:00
int32_t result = mtmd_helper_eval_chunk_single ( mctx , ctx ,
2025-05-23 11:03:47 +02:00
chunk . get ( ) ,
2025-10-30 18:42:57 +02:00
pos ,
2025-05-09 19:29:37 +02:00
seq_id ,
n_batch ,
true , // logits last
& new_n_past ) ;
2025-05-23 11:03:47 +02:00
SRV_INF ( " %s processed in % " PRId64 " ms \n " , name , ggml_time_ms ( ) - t0 ) ;
2025-05-09 19:29:37 +02:00
if ( result ! = 0 ) {
LOG_ERR ( " mtmd_helper_eval failed with status %d " , result ) ;
2025-10-30 18:42:57 +02:00
n_tokens_out = 0 ;
2025-05-09 19:29:37 +02:00
return result ;
}
2025-10-30 18:42:57 +02:00
n_tokens_out = mtmd_input_chunk_get_n_tokens ( chunk . get ( ) ) ;
2025-05-09 19:29:37 +02:00
return 0 ;
}
} ;
// Computes FNV-1a hash of the data
static std : : string fnv_hash ( const uint8_t * data , size_t len ) {
const uint64_t fnv_prime = 0x100000001b3ULL ;
uint64_t hash = 0xcbf29ce484222325ULL ;
for ( size_t i = 0 ; i < len ; + + i ) {
hash ^ = data [ i ] ;
hash * = fnv_prime ;
}
return std : : to_string ( hash ) ;
}
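// Note: this is the 64-bit FNV-1a variant (offset basis 0xcbf29ce484222325, prime
// 0x100000001b3) rendered as a decimal string, e.g. hashing zero bytes yields
// "14695981039346656037" (the offset basis itself). The same bitmap data therefore always
// maps to the same id, which is what makes it usable as a KV-cache key for media chunks.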
2025-08-22 08:10:14 +00:00
static server_tokens process_mtmd_prompt ( mtmd_context * mctx , std : : string prompt , std : : vector < raw_buffer > files ) {
mtmd : : bitmaps bitmaps ;
for ( auto & file : files ) {
mtmd : : bitmap bmp ( mtmd_helper_bitmap_init_from_buf ( mctx , file . data ( ) , file . size ( ) ) ) ;
if ( ! bmp . ptr ) {
throw std : : runtime_error ( " Failed to load image or audio file " ) ;
}
// calculate bitmap hash (for KV caching)
std : : string hash = fnv_hash ( bmp . data ( ) , bmp . n_bytes ( ) ) ;
bmp . set_id ( hash . c_str ( ) ) ;
bitmaps . entries . push_back ( std : : move ( bmp ) ) ;
}
// process the multimodal prompt
mtmd_input_text inp_txt = {
prompt . c_str ( ) ,
/* add_special */ true ,
/* parse_special */ true ,
} ;
mtmd : : input_chunks chunks ( mtmd_input_chunks_init ( ) ) ;
auto bitmaps_c_ptr = bitmaps . c_ptr ( ) ;
int32_t tokenized = mtmd_tokenize ( mctx ,
chunks . ptr . get ( ) ,
& inp_txt ,
bitmaps_c_ptr . data ( ) ,
bitmaps_c_ptr . size ( ) ) ;
if ( tokenized ! = 0 ) {
throw std : : runtime_error ( " Failed to tokenize prompt " ) ;
}
auto result = server_tokens ( chunks , true ) ;
return result ;
}
/**
 * tokenize a single " prompt " element ( string , token array , mixed array , or multimodal object ) .
 * use tokenize_input_prompts ( ) instead if the input could be an array of prompts .
* this supports these cases :
* - " prompt " : " string "
* - " prompt " : [ 12 , 34 , 56 ]
* - " prompt " : [ 12 , 34 , " string " , 56 , 78 ]
* - " prompt " : { " prompt_string " : " string " , " multimodal_data " : [ " base64 " ] }
*/
static server_tokens tokenize_input_subprompt ( const llama_vocab * vocab , mtmd_context * mctx , const json & json_prompt , bool add_special , bool parse_special ) {
constexpr char JSON_STRING_PROMPT_KEY [ ] = " prompt_string " ;
constexpr char JSON_MTMD_DATA_KEY [ ] = " multimodal_data " ;
const bool has_mtmd = mctx ! = nullptr ;
if ( json_prompt . is_string ( ) | | json_is_array_of_mixed_numbers_strings ( json_prompt ) ) {
// string or mixed
llama_tokens tmp = tokenize_mixed ( vocab , json_prompt , add_special , parse_special ) ;
return server_tokens ( tmp , false ) ;
} else if ( json_is_array_of_numbers ( json_prompt ) ) {
// array of tokens
llama_tokens tmp = json_prompt . get < llama_tokens > ( ) ;
return server_tokens ( tmp , false ) ;
} else if ( json_prompt . contains ( JSON_STRING_PROMPT_KEY ) ) {
// JSON object with prompt key.
if ( json_prompt . contains ( JSON_MTMD_DATA_KEY ) ) {
if ( ! has_mtmd )
throw std : : runtime_error ( " Multimodal data provided, but model does not support multimodal requests. " ) ;
// JSON object with prompt and multimodal key.
std : : vector < raw_buffer > files ;
for ( const auto & entry : json_prompt . at ( JSON_MTMD_DATA_KEY ) ) {
files . push_back ( base64_decode ( entry ) ) ;
}
return process_mtmd_prompt ( mctx , json_prompt . at ( JSON_STRING_PROMPT_KEY ) , files ) ;
} else {
// Not multimodal, but contains a subobject.
llama_tokens tmp = tokenize_mixed ( vocab , json_prompt . at ( JSON_STRING_PROMPT_KEY ) , add_special , parse_special ) ;
return server_tokens ( tmp , false ) ;
}
} else {
throw std : : runtime_error ( " \" prompt \" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens. " ) ;
}
}
/**
 * break the input " prompt " object into multiple prompts if needed , then tokenize them
* this supports these cases :
* - " prompt " : " string "
* - " prompt " : [ 12 , 34 , 56 ]
* - " prompt " : [ 12 , 34 , " string " , 56 , 78 ]
* - " prompt " : { " prompt_string " : " string " , " multimodal_data " : [ " base64 " ] }
* and multiple prompts ( multi - tasks ) :
* - " prompt " : [ " string1 " , " string2 " ]
* - " prompt " : [ " string1 " , [ 12 , 34 , 56 ] ]
* - " prompt " : [[12, 34, 56], [78, 90, 12]]
* - " prompt " : [ [ 12 , 34 , " string " , 56 , 78 ] , [ 12 , 34 , 56 ] , { " prompt_string " : " string " , " multimodal_data " : [ " base64 " ] } ]
*/
static std : : vector < server_tokens > tokenize_input_prompts ( const llama_vocab * vocab , mtmd_context * mctx , const json & json_prompt , bool add_special , bool parse_special ) {
std : : vector < server_tokens > result ;
if ( json_prompt . is_array ( ) & & ! json_is_array_and_contains_numbers ( json_prompt ) ) {
result . reserve ( json_prompt . size ( ) ) ;
for ( const auto & p : json_prompt ) {
result . push_back ( tokenize_input_subprompt ( vocab , mctx , p , add_special , parse_special ) ) ;
}
} else {
result . push_back ( tokenize_input_subprompt ( vocab , mctx , json_prompt , add_special , parse_special ) ) ;
}
if ( result . empty ( ) ) {
throw std : : runtime_error ( " \" prompt \" must not be empty " ) ;
}
return result ;
}
2025-09-25 03:53:09 -05:00
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
static server_tokens format_rerank ( const struct llama_model * model , const struct llama_vocab * vocab , mtmd_context * mctx , const std : : string & query , const std : : string & doc ) {
server_tokens result = { } ;
const char * rerank_prompt = llama_model_chat_template ( model , " rerank " ) ;
if ( rerank_prompt ! = nullptr ) {
std : : string prompt = rerank_prompt ;
string_replace_all ( prompt , " {query} " , query ) ;
string_replace_all ( prompt , " {document} " , doc ) ;
server_tokens tokens = tokenize_input_subprompt ( vocab , mctx , prompt , false , true ) ;
result . push_back ( tokens ) ;
} else {
// Get EOS token - use SEP token as fallback if EOS is not available
server_tokens query_tokens = tokenize_input_subprompt ( vocab , mctx , query , false , false ) ;
server_tokens doc_tokens = tokenize_input_subprompt ( vocab , mctx , doc , false , false ) ;
llama_token eos_token = llama_vocab_eos ( vocab ) ;
if ( eos_token = = LLAMA_TOKEN_NULL ) {
eos_token = llama_vocab_sep ( vocab ) ;
}
if ( llama_vocab_get_add_bos ( vocab ) ) {
result . push_back ( llama_vocab_bos ( vocab ) ) ;
}
result . push_back ( query_tokens ) ;
if ( llama_vocab_get_add_eos ( vocab ) ) {
result . push_back ( eos_token ) ;
}
if ( llama_vocab_get_add_sep ( vocab ) ) {
result . push_back ( llama_vocab_sep ( vocab ) ) ;
}
result . push_back ( doc_tokens ) ;
if ( llama_vocab_get_add_eos ( vocab ) ) {
result . push_back ( eos_token ) ;
}
}
return result ;
}
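// Illustrative layout of the fallback path above (token names instead of ids, assuming the
// vocab defines BOS/EOS/SEP and the add_bos/add_eos/add_sep flags are all enabled):
//
//   [BOS] <query tokens> [EOS] [SEP] <document tokens> [EOS]
//
// when the model ships a "rerank" chat template, the {query} and {document} placeholders are
// substituted into that template instead and the result is tokenized as a single prompt.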