2024-01-26 13:42:20 +01:00
# pragma once
2024-03-07 11:41:53 +02:00
# include "common.h"
2024-09-15 20:46:12 +03:00
# include "log.h"
# include "llama.h"
2025-05-09 19:29:37 +02:00
# include "arg.h" // common_remote_get_content
2025-04-09 11:11:11 +03:00
# include "base64.hpp"
2025-05-09 19:29:37 +02:00
# include "mtmd.h"
2025-05-28 22:35:22 +02:00
# include "mtmd-helper.h"
2025-05-30 16:25:45 +03:00
# include "chat.h"
2024-01-26 13:42:20 +01:00
2024-09-02 17:11:51 +02:00
// increase max payload length to allow use of larger context size
# define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
2025-07-14 13:14:30 +02:00
// increase backlog size to avoid connection resets for >> 1 slots
# define CPPHTTPLIB_LISTEN_BACKLOG 512
2025-02-22 12:46:31 +02:00
// disable Nagle's algorithm
# define CPPHTTPLIB_TCP_NODELAY true
2025-05-30 16:25:45 +03:00
# include <cpp-httplib/httplib.h>
2024-09-02 17:11:51 +02:00
2024-05-08 21:53:08 +02:00
# define JSON_ASSERT GGML_ASSERT
2025-05-30 16:25:45 +03:00
# include <nlohmann/json.hpp>
2024-01-26 13:42:20 +01:00
2024-09-15 20:46:12 +03:00
# include <random>
# include <sstream>
2024-03-07 11:41:53 +02:00
# include <string>
# include <vector>
2024-12-06 11:14:32 +01:00
# include <memory>
2025-05-09 19:29:37 +02:00
# include <cinttypes>
2024-03-07 11:41:53 +02:00
2024-12-14 22:29:45 +00:00
# define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
2024-01-26 13:42:20 +01:00
2024-03-22 13:07:44 +00:00
using json = nlohmann : : ordered_json ;
2024-10-24 21:51:22 +02:00
// Logging helpers. Each one forwards to the matching LOG_INF / LOG_WRN / LOG_ERR / LOG_DBG
// macro and prepends an origin tag (slot, srv or que) followed by the calling function's
// name (__func__), which "%12.*s" with precision argument 12 pads to 12 columns and cuts
// to at most 12 characters.
// The SLT_* variants take a slot object first and additionally print (slot).id and
// (slot).id_task.
// NOTE: __VA_ARGS__ is placed after an unconditional comma, so every call site has to
// supply at least one argument after fmt.
# define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
# define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
# define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
# define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
# define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
# define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
# define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
# define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
# define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
# define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
# define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
# define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
2024-01-26 13:42:20 +01:00
2025-05-09 19:29:37 +02:00
using raw_buffer = std : : vector < uint8_t > ;
2024-03-07 11:41:53 +02:00
// Reads the entry `key` from `body` and converts it to T.
//
// Returns `default_value` when:
//   - `body` does not contain `key`,
//   - the entry is JSON null, or
//   - converting the entry to T raises a nlohmann type_error (a warning naming the
//     parameter and the expected JSON type of `default_value` is logged in that case).
// Exceptions of any other type are not caught here.
template < typename T >
2024-05-08 13:24:14 +02:00
static T json_value ( const json & body , const std : : string & key , const T & default_value ) {
2024-03-07 11:41:53 +02:00
// Fallback null to default value
2024-05-08 13:24:14 +02:00
if ( body . contains ( key ) & & ! body . at ( key ) . is_null ( ) ) {
2024-04-03 20:09:52 +02:00
try {
2024-05-08 13:24:14 +02:00
// implicit conversion from json to T happens on return and is what may throw
return body . at ( key ) ;
2025-09-05 14:31:24 -06:00
} catch ( NLOHMANN_JSON_NAMESPACE : : detail : : type_error const & err ) {
LOG_WRN ( " Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s \n " , key . c_str ( ) , json ( default_value ) . type_name ( ) , err . what ( ) ) ;
2024-04-03 20:09:52 +02:00
return default_value ;
}
} else {
return default_value ;
}
2024-03-07 11:41:53 +02:00
}
2024-02-29 21:42:11 +01:00
2024-12-23 12:02:44 +01:00
// Build identifier string assembled from LLAMA_BUILD_NUMBER and LLAMA_COMMIT.
// Within this file it is appended to the User-Agent header of remote image downloads.
const static std : : string build_info ( " b " + std : : to_string ( LLAMA_BUILD_NUMBER ) + " - " + LLAMA_COMMIT ) ;
2025-04-02 09:58:34 +02:00
// thin wrapper around common_grammar_trigger with (de)serialization functions
//
// JSON layout handled by both directions: a type entry (the enum stored as int), a value
// entry (string) and, only for COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN triggers, a token entry
// (the llama_token stored as int).
struct server_grammar_trigger {
// the wrapped trigger
common_grammar_trigger value ;
server_grammar_trigger ( ) = default ;
// wraps a copy of an existing trigger (implicit conversion is allowed)
server_grammar_trigger ( const common_grammar_trigger & value ) : value ( value ) { }
// Deserializes from JSON. Entries are read with at(), so a missing entry raises an
// exception instead of being defaulted. For non-token trigger types value.token is
// not assigned here and keeps whatever common_grammar_trigger initializes it to.
server_grammar_trigger ( const json & in ) {
value . type = ( common_grammar_trigger_type ) in . at ( " type " ) . get < int > ( ) ;
value . value = in . at ( " value " ) . get < std : : string > ( ) ;
if ( value . type = = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN ) {
value . token = ( llama_token ) in . at ( " token " ) . get < int > ( ) ;
}
}
// Serializes to the JSON layout described above; the token entry is only emitted for
// token triggers, mirroring the constructor.
json to_json ( ) const {
json out {
{ " type " , ( int ) value . type } ,
{ " value " , value . value } ,
} ;
if ( value . type = = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN ) {
out [ " token " ] = ( int ) value . token ;
}
return out ;
}
} ;
2024-01-26 13:42:20 +01:00
//
2024-10-24 21:51:22 +02:00
// tokenizer and input processing utils
2024-01-26 13:42:20 +01:00
//
2024-10-24 21:51:22 +02:00
// Returns true when `data` is a JSON array whose elements are all integers.
// An empty array qualifies; anything that is not an array does not.
static bool json_is_array_of_numbers(const json & data) {
    if (!data.is_array()) {
        return false;
    }
    for (const auto & elem : data) {
        const bool is_int = elem.is_number_integer();
        if (!is_int) {
            return false;
        }
    }
    return true;
}
// Returns true when `data` is a JSON array that holds at least one string AND at least
// one integer. Scanning stops as soon as both kinds have been seen.
static bool json_is_array_of_mixed_numbers_strings(const json & data) {
    if (!data.is_array()) {
        return false;
    }
    bool has_str = false;
    bool has_int = false;
    for (const auto & elem : data) {
        if (elem.is_string()) {
            has_str = true;
        }
        if (elem.is_number_integer()) {
            has_int = true;
        }
        if (has_str && has_int) {
            return true;
        }
    }
    return false;
}
2025-08-22 08:10:14 +00:00
// Returns true when `data` is a JSON array with at least one integer element
// (i.e. at least one individual token id); false for non-arrays and empty arrays.
static bool json_is_array_and_contains_numbers(const json & data) {
    if (!data.is_array()) {
        return false;
    }
    for (const auto & elem : data) {
        if (elem.is_number_integer()) {
            return true;
        }
    }
    return false;
}
2024-12-24 19:39:49 +03:00
// Looks up several '/'-separated key paths (e.g. "key1/key2") inside `js`.
//
// paths - paths to resolve; each path is split on '/' and followed key by key
// js    - document to search
//
// Returns a JSON object mapping every path that could be fully resolved to a copy of the
// value found there. A path is dropped when any step hits a non-object or a missing key.
//
// The document is walked through a const pointer, so only the final value of each
// resolved path is copied, and the walk stops at the first step that fails.
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
    json result = json::object();

    for (const std::string & path : paths) {
        const json * node = &js;

        for (const std::string & k : string_split<std::string>(path, /*separator*/ '/')) {
            if (!node->is_object()) {
                node = nullptr;
                break;
            }
            const auto it = node->find(k);
            if (it == node->end()) {
                node = nullptr;
                break;
            }
            node = &*it;
        }

        if (node != nullptr) {
            result[path] = *node;
        }
    }

    return result;
}
2024-10-24 21:51:22 +02:00
/**
* this handles 2 cases :
* - only string , example : " string "
* - mixed string and tokens , example : [ 12 , 34 , " string " , 56 , 78 ]
*/
2025-01-12 11:32:42 +02:00
static llama_tokens tokenize_mixed ( const llama_vocab * vocab , const json & json_prompt , bool add_special , bool parse_special ) {
2024-10-24 21:51:22 +02:00
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
llama_tokens prompt_tokens ;
if ( json_prompt . is_array ( ) ) {
bool first = true ;
for ( const auto & p : json_prompt ) {
if ( p . is_string ( ) ) {
auto s = p . template get < std : : string > ( ) ;
llama_tokens p ;
if ( first ) {
2025-01-12 11:32:42 +02:00
p = common_tokenize ( vocab , s , add_special , parse_special ) ;
2024-10-24 21:51:22 +02:00
first = false ;
} else {
2025-01-12 11:32:42 +02:00
p = common_tokenize ( vocab , s , false , parse_special ) ;
2024-10-24 21:51:22 +02:00
}
prompt_tokens . insert ( prompt_tokens . end ( ) , p . begin ( ) , p . end ( ) ) ;
} else {
if ( first ) {
first = false ;
}
prompt_tokens . push_back ( p . template get < llama_token > ( ) ) ;
}
}
} else {
auto s = json_prompt . template get < std : : string > ( ) ;
2025-01-12 11:32:42 +02:00
prompt_tokens = common_tokenize ( vocab , s , add_special , parse_special ) ;
2024-10-24 21:51:22 +02:00
}
return prompt_tokens ;
}
2024-12-19 15:40:08 +01:00
// Returns how many leading bytes of `text` can be emitted without ending inside a
// multi-byte UTF-8 character.
//  - result == text.size(): no truncated character was detected at the end
//  - result <  text.size(): index of the lead byte of the truncated trailing character
// Only the last (up to) four bytes are inspected; the rest of the string is not validated.
static size_t validate_utf8(const std::string & text) {
    const size_t n_bytes = text.size();
    const size_t n_scan  = n_bytes < 4 ? n_bytes : 4;

    for (size_t back = 1; back <= n_scan; ++back) {
        const unsigned char byte = static_cast<unsigned char>(text[n_bytes - back]);

        // total length announced by a lead byte, 0 for ASCII / continuation bytes
        size_t need = 0;
        if ((byte & 0xE0) == 0xC0) {
            need = 2; // 110xxxxx
        } else if ((byte & 0xF0) == 0xE0) {
            need = 3; // 1110xxxx
        } else if ((byte & 0xF8) == 0xF0) {
            need = 4; // 11110xxx
        }

        // fewer bytes are available from this lead byte to the end than it announces
        if (back < need) {
            return n_bytes - back;
        }
    }

    return n_bytes;
}
2024-10-24 21:51:22 +02:00
//
// template utils
//
// format infill task
//
// Builds the token sequence for a fill-in-the-middle (FIM) request.
//
// vocab         - vocabulary used for tokenization and for the FIM / BOS special tokens
// input_prefix  - FIM prefix, in any form accepted by tokenize_mixed
// input_suffix  - FIM suffix, in any form accepted by tokenize_mixed
// input_extra   - iterated as a list of chunks; from each chunk the text and filename
//                 entries are read via json_value
// n_batch       - budget for prefix / suffix: at most 3*(n_batch/4) prefix tokens are kept
//                 and the suffix gets what is left of n_batch/4 after 2 + tokens_prompt.size()
// n_predict,
// n_ctx         - at most n_ctx - n_batch - 2*n_predict extra-context tokens are kept
// spm_infill    - when true the suffix part is emitted before the prefix part
// tokens_prompt - appended directly after the kept prefix tokens
//
// Resulting order: [kept extra tokens][BOS if the vocab adds one][first part][second part][FIM_MID]
// where the parts are [FIM_PRE + prefix + tokens_prompt] and [FIM_SUF + suffix].
static llama_tokens format_infill (
2025-01-12 11:32:42 +02:00
const llama_vocab * vocab ,
2024-10-24 21:51:22 +02:00
const json & input_prefix ,
const json & input_suffix ,
const json & input_extra ,
const int n_batch ,
const int n_predict ,
const int n_ctx ,
const bool spm_infill ,
const llama_tokens & tokens_prompt
) {
// TODO: optimize this block by reducing memory allocations and movement
// use FIM repo-level pattern:
// ref: https://arxiv.org/pdf/2409.12186
//
// [FIM_REP]myproject
// [FIM_SEP]filename0
// extra chunk 0
// [FIM_SEP]filename1
// extra chunk 1
// ...
// [FIM_SEP]filename
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
//
// NOTE(review): the code below places tokens_prompt right after the prefix (before
// [FIM_SUF]) and ends the sequence with [FIM_MID]; the last line of the pattern above
// shows the prompt after [FIM_MID] instead - confirm which one is intended.
llama_tokens extra_tokens ;
extra_tokens . reserve ( n_ctx ) ;
2025-01-12 11:32:42 +02:00
auto tokens_prefix = tokenize_mixed ( vocab , input_prefix , false , false ) ;
auto tokens_suffix = tokenize_mixed ( vocab , input_suffix , false , false ) ;
2024-10-24 21:51:22 +02:00
2025-01-12 11:32:42 +02:00
if ( llama_vocab_fim_rep ( vocab ) ! = LLAMA_TOKEN_NULL ) {
2024-10-24 21:51:22 +02:00
// TODO: make project name an input
2025-01-12 11:32:42 +02:00
// function-local static: tokenized once, using the vocab of the first call that gets here
static const auto k_fim_repo = common_tokenize ( vocab , " myproject \n " , false , false ) ;
2024-10-24 21:51:22 +02:00
2025-01-12 11:32:42 +02:00
extra_tokens . push_back ( llama_vocab_fim_rep ( vocab ) ) ;
2024-10-24 21:51:22 +02:00
extra_tokens . insert ( extra_tokens . end ( ) , k_fim_repo . begin ( ) , k_fim_repo . end ( ) ) ;
}
for ( const auto & chunk : input_extra ) {
// { "text": string, "filename": string }
const std : : string text = json_value ( chunk , " text " , std : : string ( ) ) ;
const std : : string filename = json_value ( chunk , " filename " , std : : string ( " tmp " ) ) ;
2025-01-12 11:32:42 +02:00
if ( llama_vocab_fim_sep ( vocab ) ! = LLAMA_TOKEN_NULL ) {
const auto k_fim_file = common_tokenize ( vocab , filename + " \n " , false , false ) ;
2024-10-24 21:51:22 +02:00
2025-01-12 11:32:42 +02:00
extra_tokens . insert ( extra_tokens . end ( ) , llama_vocab_fim_sep ( vocab ) ) ;
2024-10-24 21:51:22 +02:00
extra_tokens . insert ( extra_tokens . end ( ) , k_fim_file . begin ( ) , k_fim_file . end ( ) ) ;
} else {
// chunk separator in binary form to avoid confusing the AI
// (the bytes spell two newlines, the ASCII text "--- snippet ---", two newlines, then the terminating NUL)
static const char k_chunk_prefix_str [ ] = { 0x0a , 0x0a , 0x2d , 0x2d , 0x2d , 0x20 , 0x73 , 0x6e , 0x69 , 0x70 , 0x70 , 0x65 , 0x74 , 0x20 , 0x2d , 0x2d , 0x2d , 0x0a , 0x0a , 0x00 } ;
2025-01-12 11:32:42 +02:00
static const auto k_chunk_prefix_tokens = common_tokenize ( vocab , k_chunk_prefix_str , false , false ) ;
2024-10-24 21:51:22 +02:00
extra_tokens . insert ( extra_tokens . end ( ) , k_chunk_prefix_tokens . begin ( ) , k_chunk_prefix_tokens . end ( ) ) ;
}
2025-01-12 11:32:42 +02:00
const auto chunk_tokens = common_tokenize ( vocab , text , false , false ) ;
2024-10-24 21:51:22 +02:00
extra_tokens . insert ( extra_tokens . end ( ) , chunk_tokens . begin ( ) , chunk_tokens . end ( ) ) ;
}
2025-01-12 11:32:42 +02:00
if ( llama_vocab_fim_sep ( vocab ) ! = LLAMA_TOKEN_NULL ) {
2024-10-24 21:51:22 +02:00
// TODO: current filename
2025-01-12 11:32:42 +02:00
static const auto k_fim_file = common_tokenize ( vocab , " filename \n " , false , false ) ;
2024-10-24 21:51:22 +02:00
2025-01-12 11:32:42 +02:00
extra_tokens . insert ( extra_tokens . end ( ) , llama_vocab_fim_sep ( vocab ) ) ;
2024-10-24 21:51:22 +02:00
extra_tokens . insert ( extra_tokens . end ( ) , k_fim_file . begin ( ) , k_fim_file . end ( ) ) ;
}
// for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
2024-10-28 08:49:32 +02:00
const int n_prefix_take = std : : min < int > ( tokens_prefix . size ( ) , 3 * ( n_batch / 4 ) ) ;
// NOTE(review): tokens_prompt.size() is unsigned, so the subtraction is evaluated in
// unsigned arithmetic; it only becomes a negative number again through the conversion
// to int requested by std::max<int>, which then clamps it to 0.
const int n_suffix_take = std : : min < int > ( tokens_suffix . size ( ) , std : : max < int > ( 0 , ( n_batch / 4 ) - ( 2 + tokens_prompt . size ( ) ) ) ) ;
SRV_DBG ( " n_prefix_take = %d, n_suffix_take = %d, total = %d \n " , n_prefix_take , n_suffix_take , ( n_prefix_take + n_suffix_take ) ) ;
2024-10-24 21:51:22 +02:00
// fill the rest of the context with extra chunks
const int n_extra_take = std : : min < int > ( std : : max < int > ( 0 , n_ctx - ( n_batch ) - 2 * n_predict ) , extra_tokens . size ( ) ) ;
// keep only the LAST n_prefix_take prefix tokens ...
tokens_prefix . erase ( tokens_prefix . begin ( ) , tokens_prefix . begin ( ) + tokens_prefix . size ( ) - n_prefix_take ) ;
// ... and only the FIRST n_suffix_take suffix tokens
tokens_suffix . resize ( n_suffix_take ) ;
2025-01-12 11:32:42 +02:00
tokens_prefix . insert ( tokens_prefix . begin ( ) , llama_vocab_fim_pre ( vocab ) ) ;
2024-10-24 21:51:22 +02:00
tokens_prefix . insert ( tokens_prefix . end ( ) , tokens_prompt . begin ( ) , tokens_prompt . end ( ) ) ;
2025-01-12 11:32:42 +02:00
tokens_suffix . insert ( tokens_suffix . begin ( ) , llama_vocab_fim_suf ( vocab ) ) ;
2024-10-24 21:51:22 +02:00
// spm_infill selects which part comes first; both parts are copied here
auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix ;
auto embd_end = spm_infill ? tokens_prefix : tokens_suffix ;
2025-01-12 11:32:42 +02:00
if ( llama_vocab_get_add_bos ( vocab ) ) {
embd_inp . insert ( embd_inp . begin ( ) , llama_vocab_bos ( vocab ) ) ;
2024-10-24 21:51:22 +02:00
}
SRV_DBG ( " extra: n_ctx = %d, n_extra_take = %d, n_extra = %d \n " , n_ctx , n_extra_take , ( int ) extra_tokens . size ( ) ) ;
// put the extra context before the FIM prefix
// (it is inserted at the very front, i.e. ahead of the BOS token added above; only the
// last n_extra_take extra tokens are used)
embd_inp . insert ( embd_inp . begin ( ) , extra_tokens . end ( ) - n_extra_take , extra_tokens . end ( ) ) ;
embd_inp . insert ( embd_inp . end ( ) , embd_end . begin ( ) , embd_end . end ( ) ) ;
2025-01-12 11:32:42 +02:00
embd_inp . push_back ( llama_vocab_fim_mid ( vocab ) ) ;
2024-10-24 21:51:22 +02:00
return embd_inp ;
}
2024-01-26 13:42:20 +01:00
//
// base64 utils (TODO: move to common in the future)
//
// Base64 alphabet. base64_decode() uses the position of a character inside this string
// (via find) as that character's 6-bit value.
static const std : : string base64_chars =
" ABCDEFGHIJKLMNOPQRSTUVWXYZ "
" abcdefghijklmnopqrstuvwxyz "
" 0123456789+/ " ;
2024-03-07 11:41:53 +02:00
// Returns true when `c` is accepted as a base64 alphabet character: anything isalnum()
// reports as alphanumeric, plus the two extra symbols compared below.
// The padding character is not accepted here; base64_decode() tests for it separately.
// NOTE: isalnum() consults the current C locale.
static inline bool is_base64 ( uint8_t c ) {
2024-01-26 13:42:20 +01:00
return ( isalnum ( c ) | | ( c = = ' + ' ) | | ( c = = ' / ' ) ) ;
}
2025-05-09 19:29:37 +02:00
// Decodes a base64 string into raw bytes.
//
// Decoding stops silently at the first padding character or at the first character that
// is_base64() rejects; whatever was decoded up to that point is returned. No error is
// reported for malformed input.
// Complete groups of 4 characters yield 3 bytes each; a trailing partial group of i
// characters yields i - 1 bytes (so a single leftover character yields nothing).
// NOTE: the input length is stored in an int.
static inline raw_buffer base64_decode ( const std : : string & encoded_string ) {
2024-01-26 13:42:20 +01:00
int i = 0 ;
int j = 0 ;
int in_ = 0 ;
int in_len = encoded_string . size ( ) ;
uint8_t char_array_4 [ 4 ] ;
uint8_t char_array_3 [ 3 ] ;
2025-05-09 19:29:37 +02:00
raw_buffer ret ;
2024-01-26 13:42:20 +01:00
2024-03-07 11:41:53 +02:00
// consume characters while input remains and the current one is a data character
while ( in_len - - & & ( encoded_string [ in_ ] ! = ' = ' ) & & is_base64 ( encoded_string [ in_ ] ) ) {
2024-01-26 13:42:20 +01:00
char_array_4 [ i + + ] = encoded_string [ in_ ] ; in_ + + ;
2024-03-07 11:41:53 +02:00
if ( i = = 4 ) {
// a full group is buffered: map each character to its 6-bit value ...
for ( i = 0 ; i < 4 ; i + + ) {
2024-01-26 13:42:20 +01:00
char_array_4 [ i ] = base64_chars . find ( char_array_4 [ i ] ) ;
}
// ... and repack 4 x 6 bits into 3 x 8 bits
char_array_3 [ 0 ] = ( ( char_array_4 [ 0 ] ) < < 2 ) + ( ( char_array_4 [ 1 ] & 0x30 ) > > 4 ) ;
char_array_3 [ 1 ] = ( ( char_array_4 [ 1 ] & 0xf ) < < 4 ) + ( ( char_array_4 [ 2 ] & 0x3c ) > > 2 ) ;
char_array_3 [ 2 ] = ( ( char_array_4 [ 2 ] & 0x3 ) < < 6 ) + char_array_4 [ 3 ] ;
2024-03-07 11:41:53 +02:00
for ( i = 0 ; ( i < 3 ) ; i + + ) {
2024-01-26 13:42:20 +01:00
ret . push_back ( char_array_3 [ i ] ) ;
}
2024-03-07 11:41:53 +02:00
2024-01-26 13:42:20 +01:00
i = 0 ;
}
}
2024-03-07 11:41:53 +02:00
// leftover partial group (1..3 characters)
if ( i ) {
for ( j = i ; j < 4 ; j + + ) {
2024-01-26 13:42:20 +01:00
char_array_4 [ j ] = 0 ;
}
2024-03-07 11:41:53 +02:00
// the zero-filled slots are looked up as well; find() does not locate them in the
// alphabet, but the i - 1 bytes emitted below never depend on those slots
for ( j = 0 ; j < 4 ; j + + ) {
2024-01-26 13:42:20 +01:00
char_array_4 [ j ] = base64_chars . find ( char_array_4 [ j ] ) ;
}
char_array_3 [ 0 ] = ( ( char_array_4 [ 0 ] ) < < 2 ) + ( ( char_array_4 [ 1 ] & 0x30 ) > > 4 ) ;
char_array_3 [ 1 ] = ( ( char_array_4 [ 1 ] & 0xf ) < < 4 ) + ( ( char_array_4 [ 2 ] & 0x3c ) > > 2 ) ;
char_array_3 [ 2 ] = ( ( char_array_4 [ 2 ] & 0x3 ) < < 6 ) + char_array_4 [ 3 ] ;
2024-03-07 11:41:53 +02:00
for ( j = 0 ; j < i - 1 ; j + + ) {
2024-01-26 13:42:20 +01:00
ret . push_back ( char_array_3 [ j ] ) ;
}
}
return ret ;
}
//
// random string / id
//
2024-03-07 11:41:53 +02:00
// Returns a 32-character string whose characters are drawn from the digit / upper-case /
// lower-case alphabet defined below.
// A fresh std::mt19937, seeded from std::random_device, is created on every call.
// NOTE: std::mt19937 is not a cryptographic generator.
static std : : string random_string ( ) {
2024-01-26 13:42:20 +01:00
static const std : : string str ( " 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz " ) ;
std : : random_device rd ;
std : : mt19937 generator ( rd ( ) ) ;
std : : string result ( 32 , ' ' ) ;
for ( int i = 0 ; i < 32 ; + + i ) {
// pick one alphabet character per position
result [ i ] = str [ generator ( ) % str . size ( ) ] ;
}
return result ;
}
2024-03-07 11:41:53 +02:00
// Returns a new chat-completion id: a fixed prefix followed by random_string().
static std : : string gen_chatcmplid ( ) {
2024-09-15 20:46:12 +03:00
return " chatcmpl- " + random_string ( ) ;
2024-01-26 13:42:20 +01:00
}
2024-02-29 21:42:11 +01:00
2025-03-10 09:45:29 +00:00
// Returns a new tool-call id; this is the plain output of random_string() with no prefix.
static std : : string gen_tool_call_id ( ) {
return random_string ( ) ;
}
2024-02-29 21:42:11 +01:00
//
// other common utils
//
// TODO: reuse llama_detokenize
template < class Iter >
2024-03-07 11:41:53 +02:00
static std : : string tokens_to_str ( llama_context * ctx , Iter begin , Iter end ) {
2024-02-29 21:42:11 +01:00
std : : string ret ;
2024-03-07 11:41:53 +02:00
for ( ; begin ! = end ; + + begin ) {
2024-10-10 22:57:42 +02:00
ret + = common_token_to_piece ( ctx , * begin ) ;
2024-02-29 21:42:11 +01:00
}
2024-03-07 11:41:53 +02:00
2024-02-29 21:42:11 +01:00
return ret ;
}
// format incomplete utf-8 multibyte character for output
2024-03-07 11:41:53 +02:00
// Returns the text piece of `token` for output.
//  - LLAMA_TOKEN_NULL is rendered with the literal on the first line of the body,
//    without calling common_token_to_piece.
//  - A piece that is exactly one byte long with its high bit set (a lone byte of a
//    multi-byte UTF-8 character) is replaced by a textual marker followed by the byte
//    value in lower-case hexadecimal.
//  - Every other piece is returned unchanged.
static std : : string tokens_to_output_formatted_string ( const llama_context * ctx , const llama_token token ) {
2025-01-06 10:52:15 +02:00
std : : string out = token = = LLAMA_TOKEN_NULL ? " " : common_token_to_piece ( ctx , token ) ;
2024-03-07 11:41:53 +02:00
2024-02-29 21:42:11 +01:00
// if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token)
2024-03-07 11:41:53 +02:00
if ( out . size ( ) = = 1 & & ( out [ 0 ] & 0x80 ) = = 0x80 ) {
2024-02-29 21:42:11 +01:00
std : : stringstream ss ;
// mask with 0xff so the byte is printed as an unsigned value
ss < < std : : hex < < ( out [ 0 ] & 0xff ) ;
std : : string res ( ss . str ( ) ) ;
out = " byte: \\ x " + res ;
}
2024-03-07 11:41:53 +02:00
2024-02-29 21:42:11 +01:00
return out ;
}
2024-09-15 20:46:12 +03:00
// Writes one server-sent-event message to `sink`.
//
// sink  - httplib data sink receiving the bytes
// event - field name placed in front of the payload
// data  - payload; serialized with dump() in compact form, using the `replace` error
//         handler so that serialization does not throw on invalid UTF-8
//
// The message is the field name, a separator, the serialized payload and a terminating
// blank line. It is also logged at debug level.
// Returns the result of sink.write().
static bool server_sent_event ( httplib : : DataSink & sink , const char * event , const json & data ) {
2024-09-02 17:11:51 +02:00
const std : : string str =
std : : string ( event ) + " : " +
data . dump ( - 1 , ' ' , false , json : : error_handler_t : : replace ) +
2024-12-06 11:14:32 +01:00
" \n \n " ; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
2024-09-02 17:11:51 +02:00
2024-09-15 20:46:12 +03:00
LOG_DBG ( " data stream, to_send: %s " , str . c_str ( ) ) ;
2024-09-02 17:11:51 +02:00
return sink . write ( str . c_str ( ) , str . size ( ) ) ;
}
2024-03-07 11:41:53 +02:00
//
// OAI utils
//
2025-05-23 11:03:47 +02:00
// used by /completions endpoint
2024-12-31 12:34:13 +01:00
// Translates the body of an OpenAI-style completion request into llama.cpp parameters.
//
// body - request JSON
//
// Returns a JSON object containing a normalized stop entry (always an array) plus a copy
// of every other entry of `body`.
//
// Throws std::runtime_error when
//   - the prompt entry is missing,
//   - n is present and different from 1,
//   - echo is true,
//   - one of the unsupported parameters listed below is present.
static json oaicompat_completion_params_parse ( const json & body ) {
json llama_params ;
if ( ! body . contains ( " prompt " ) ) {
throw std : : runtime_error ( " \" prompt \" is required " ) ;
}
// Handle "stop" field
// (a single string is wrapped into a one-element array; otherwise the value is taken
// as-is, defaulting to an empty array)
if ( body . contains ( " stop " ) & & body . at ( " stop " ) . is_string ( ) ) {
llama_params [ " stop " ] = json : : array ( { body . at ( " stop " ) . get < std : : string > ( ) } ) ;
} else {
llama_params [ " stop " ] = json_value ( body , " stop " , json : : array ( ) ) ;
}
// Handle "n" field
int n_choices = json_value ( body , " n " , 1 ) ;
if ( n_choices ! = 1 ) {
throw std : : runtime_error ( " Only one completion choice is allowed " ) ;
}
2025-02-25 11:52:52 +00:00
// Handle "echo" field
if ( json_value ( body , " echo " , false ) ) {
throw std : : runtime_error ( " Only no echo is supported " ) ;
}
2024-12-31 12:34:13 +01:00
// Params supported by OAI but unsupported by llama.cpp
2025-02-25 11:52:52 +00:00
static const std : : vector < std : : string > unsupported_params { " best_of " , " suffix " } ;
2024-12-31 12:34:13 +01:00
for ( const auto & param : unsupported_params ) {
if ( body . contains ( param ) ) {
throw std : : runtime_error ( " Unsupported param: " + param ) ;
}
}
// Copy remaining properties to llama_params
// (entries already set above - i.e. the normalized stop entry - are not overwritten)
for ( const auto & item : body . items ( ) ) {
// Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
// NOTE(review): nothing in this function derives a value from max_tokens, so this
// exception currently has no earlier value to overwrite - confirm whether it is still needed.
if ( ! llama_params . contains ( item . key ( ) ) | | item . key ( ) = = " n_predict " ) {
llama_params [ item . key ( ) ] = item . value ( ) ;
}
}
return llama_params ;
}
2025-05-23 11:03:47 +02:00
struct oaicompat_parser_options {
bool use_jinja ;
bool prefill_assistant ;
common_reasoning_format reasoning_format ;
2025-06-29 20:02:53 +02:00
std : : map < std : : string , std : : string > chat_template_kwargs ;
2025-05-23 11:03:47 +02:00
common_chat_templates * tmpls ;
bool allow_image ;
bool allow_audio ;
2025-05-26 00:30:51 +01:00
bool enable_thinking = true ;
2025-05-23 11:03:47 +02:00
} ;
// used by /chat/completions endpoint
static json oaicompat_chat_params_parse (
2025-05-28 22:33:54 +08:00
json & body , /* openai api json semantics */
2025-05-23 11:03:47 +02:00
const oaicompat_parser_options & opt ,
2025-05-09 19:29:37 +02:00
std : : vector < raw_buffer > & out_files )
2025-01-21 13:18:51 +00:00
{
2024-03-07 11:41:53 +02:00
json llama_params ;
2025-01-21 13:18:51 +00:00
auto tools = json_value ( body , " tools " , json ( ) ) ;
2025-05-25 01:48:08 +01:00
auto has_tools = tools . is_array ( ) & & ! tools . empty ( ) ;
2025-01-30 19:13:58 +00:00
auto stream = json_value ( body , " stream " , false ) ;
2025-05-25 01:48:08 +01:00
auto tool_choice = json_value ( body , " tool_choice " , std : : string ( " auto " ) ) ;
2025-01-21 13:18:51 +00:00
2025-05-25 01:48:08 +01:00
if ( ! opt . use_jinja ) {
if ( has_tools ) {
2025-01-21 13:18:51 +00:00
throw std : : runtime_error ( " tools param requires --jinja flag " ) ;
}
2025-05-25 01:48:08 +01:00
if ( tool_choice ! = " auto " ) {
throw std : : runtime_error ( " tool_choice param requires --jinja flag " ) ;
2025-01-30 19:13:58 +00:00
}
}
json-schema-to-grammar improvements (+ added to server) (#5978)
* json: fix arrays (disallow `[,1]`)
* json: support tuple types (`[number, string]`)
* json: support additionalProperties (`{[k: string]: [string,number][]}`)
* json: support required / optional properties
* json: add support for pattern
* json: resolve $ref (and support https schema urls)
* json: fix $ref resolution
* join: support union types (mostly for nullable types I think)
* json: support allOf + nested anyOf
* json: support any (`{}` or `{type: object}`)
* json: fix merge
* json: temp fix for escapes
* json: spaces in output and unrestricted output spaces
* json: add typings
* json:fix typo
* Create ts-type-to-grammar.sh
* json: fix _format_literal (json.dumps already escapes quotes)
* json: merge lit sequences and handle negatives
{"type": "string", "pattern": "^({\"question\": \"[^\"]+\", \"response\": \"[^\"]+\"}\\n)+$"}
* json: handle pattern repetitions
* Update json-schema-to-grammar.mjs
* Create regex-to-grammar.py
* json: extract repeated regexp patterns to subrule
* Update json-schema-to-grammar.py
* Update json-schema-to-grammar.py
* Update json-schema-to-grammar.py
* json: handle schema from pydantic Optional fields
* Update json-schema-to-grammar.py
* Update json-schema-to-grammar.py
* Update ts-type-to-grammar.sh
* Update ts-type-to-grammar.sh
* json: simplify nullable fields handling
* json: accept duplicate identical rules
* json: revert space to 1 at most
* json: reuse regexp pattern subrules
* json: handle uuid string format
* json: fix literal escapes
* json: add --allow-fetch
* json: simplify range escapes
* json: support negative ranges in patterns
* Delete commit.txt
* json: custom regex parser, adds dot support & JS-portable
* json: rm trailing spaces
* Update json-schema-to-grammar.mjs
* json: updated server & chat `( cd examples/server && ./deps.sh )`
* json: port fixes from mjs to python
* Update ts-type-to-grammar.sh
* json: support prefixItems alongside array items
* json: add date format + fix uuid
* json: add date, time, date-time formats
* json: preserve order of props from TS defs
* json: port schema converter to C++, wire in ./server
* json: nits
* Update json-schema-to-grammar.cpp
* Update json-schema-to-grammar.cpp
* Update json-schema-to-grammar.cpp
* json: fix mjs implementation + align outputs
* Update json-schema-to-grammar.mjs.hpp
* json: test C++, JS & Python versions
* json: nits + regen deps
* json: cleanup test
* json: revert from c++17 to 11
* json: nit fixes
* json: dirty include for test
* json: fix zig build
* json: pass static command to std::system in tests (fixed temp files)
* json: fix top-level $refs
* json: don't use c++20 designated initializers
* nit
* json: basic support for reserved names `{number:{number:{root:number}}}`
* Revamp test cmake to allow args (WORKING_DIRECTORY needed for JSON test)
* json: re-ran server deps.sh
* json: simplify test
* json: support mix of additional props & required/optional
* json: add tests for some expected failures
* json: fix type=const in c++, add failure expectations for non-str const&enum
* json: test (& simplify output of) empty schema
* json: check parsing in test + fix value & string refs
* json: add server tests for OAI JSON response_format
* json: test/fix top-level anyOf
* json: improve grammar parsing failures
* json: test/fix additional props corner cases
* json: fix string patterns (was missing quotes)
* json: ws nit
* json: fix json handling in server when there's no response_format
* json: catch schema conversion errors in server
* json: don't complain about unknown format type in server if unset
* json: cleaner build of test
* json: create examples/json-schema-pydantic-example.py
* json: fix date pattern
* json: move json.hpp & json-schema-to-grammar.{cpp,h} to common
* json: indent 4 spaces
* json: fix naming of top-level c++ function (+ drop unused one)
* json: avoid using namespace std
* json: fix zig build
* Update server.feature
* json: iostream -> fprintf
* json: space before & refs for consistency
* json: nits
2024-03-21 11:50:43 +00:00
2024-03-25 09:42:17 +01:00
// Handle "stop" field
2024-05-08 21:53:08 +02:00
if ( body . contains ( " stop " ) & & body . at ( " stop " ) . is_string ( ) ) {
llama_params [ " stop " ] = json : : array ( { body . at ( " stop " ) . get < std : : string > ( ) } ) ;
2024-03-07 11:41:53 +02:00
} else {
llama_params [ " stop " ] = json_value ( body , " stop " , json : : array ( ) ) ;
}
2025-02-18 18:03:23 +00:00
auto json_schema = json_value ( body , " json_schema " , json ( ) ) ;
auto grammar = json_value ( body , " grammar " , std : : string ( ) ) ;
if ( ! json_schema . is_null ( ) & & ! grammar . empty ( ) ) {
throw std : : runtime_error ( " Cannot use both json_schema and grammar " ) ;
}
2024-03-25 09:42:17 +01:00
// Handle "response_format" field
if ( body . contains ( " response_format " ) ) {
json response_format = json_value ( body , " response_format " , json : : object ( ) ) ;
std : : string response_type = json_value ( response_format , " type " , std : : string ( ) ) ;
if ( response_type = = " json_object " ) {
2025-02-18 18:03:23 +00:00
json_schema = json_value ( response_format , " schema " , json : : object ( ) ) ;
2024-09-18 01:50:34 -05:00
} else if ( response_type = = " json_schema " ) {
2025-03-04 06:24:07 +00:00
auto schema_wrapper = json_value ( response_format , " json_schema " , json : : object ( ) ) ;
json_schema = json_value ( schema_wrapper , " schema " , json : : object ( ) ) ;
2024-03-25 09:42:17 +01:00
} else if ( ! response_type . empty ( ) & & response_type ! = " text " ) {
throw std : : runtime_error ( " response_format type must be one of \" text \" or \" json_object \" , but got: " + response_type ) ;
}
}
2025-05-09 19:29:37 +02:00
// get input files
if ( ! body . contains ( " messages " ) ) {
throw std : : runtime_error ( " 'messages' is required " ) ;
}
2025-05-28 22:33:54 +08:00
json & messages = body . at ( " messages " ) ;
2025-05-09 19:29:37 +02:00
if ( ! messages . is_array ( ) ) {
throw std : : runtime_error ( " Expected 'messages' to be an array " ) ;
}
for ( auto & msg : messages ) {
2025-05-15 08:40:58 +02:00
std : : string role = json_value ( msg , " role " , std : : string ( ) ) ;
if ( role ! = " assistant " & & ! msg . contains ( " content " ) ) {
throw std : : runtime_error ( " All non-assistant messages must contain 'content' " ) ;
}
if ( role = = " assistant " ) {
if ( ! msg . contains ( " content " ) & & ! msg . contains ( " tool_calls " ) ) {
throw std : : runtime_error ( " Assistant message must contain either 'content' or 'tool_calls'! " ) ;
}
if ( ! msg . contains ( " content " ) ) {
continue ; // avoid errors with no content
}
}
2025-05-09 19:29:37 +02:00
json & content = msg . at ( " content " ) ;
2025-05-12 18:56:42 +07:00
if ( content . is_string ( ) | | content . is_null ( ) ) {
2025-05-09 19:29:37 +02:00
continue ;
}
if ( ! content . is_array ( ) ) {
throw std : : runtime_error ( " Expected 'content' to be a string or an array " ) ;
}
for ( auto & p : content ) {
std : : string type = json_value ( p , " type " , std : : string ( ) ) ;
if ( type = = " image_url " ) {
2025-05-23 11:03:47 +02:00
if ( ! opt . allow_image ) {
throw std : : runtime_error ( " image input is not supported - hint: if this is unexpected, you may need to provide the mmproj " ) ;
2025-05-09 19:29:37 +02:00
}
2025-05-23 11:03:47 +02:00
json image_url = json_value ( p , " image_url " , json : : object ( ) ) ;
2025-05-09 19:29:37 +02:00
std : : string url = json_value ( image_url , " url " , std : : string ( ) ) ;
if ( string_starts_with ( url , " http " ) ) {
// download remote image
// TODO @ngxson : maybe make these params configurable
common_remote_params params ;
params . headers . push_back ( " User-Agent: llama.cpp/ " + build_info ) ;
params . max_size = 1024 * 1024 * 10 ; // 10MB
params . timeout = 10 ; // seconds
SRV_INF ( " downloading image from '%s' \n " , url . c_str ( ) ) ;
auto res = common_remote_get_content ( url , params ) ;
if ( 200 < = res . first & & res . first < 300 ) {
SRV_INF ( " downloaded %ld bytes \n " , res . second . size ( ) ) ;
raw_buffer data ;
data . insert ( data . end ( ) , res . second . begin ( ) , res . second . end ( ) ) ;
out_files . push_back ( data ) ;
} else {
throw std : : runtime_error ( " Failed to download image " ) ;
}
} else {
// try to decode base64 image
std : : vector < std : : string > parts = string_split < std : : string > ( url , /*separator*/ ' , ' ) ;
if ( parts . size ( ) ! = 2 ) {
throw std : : runtime_error ( " Invalid image_url.url value " ) ;
} else if ( ! string_starts_with ( parts [ 0 ] , " data:image/ " ) ) {
throw std : : runtime_error ( " Invalid image_url.url format: " + parts [ 0 ] ) ;
} else if ( ! string_ends_with ( parts [ 0 ] , " base64 " ) ) {
throw std : : runtime_error ( " image_url.url must be base64 encoded " ) ;
} else {
auto base64_data = parts [ 1 ] ;
auto decoded_data = base64_decode ( base64_data ) ;
out_files . push_back ( decoded_data ) ;
}
}
// replace this chunk with a marker
p [ " type " ] = " text " ;
2025-05-22 20:42:48 +02:00
p [ " text " ] = mtmd_default_marker ( ) ;
2025-05-09 19:29:37 +02:00
p . erase ( " image_url " ) ;
2025-05-23 11:03:47 +02:00
} else if ( type = = " input_audio " ) {
if ( ! opt . allow_audio ) {
throw std : : runtime_error ( " audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj " ) ;
}
json input_audio = json_value ( p , " input_audio " , json : : object ( ) ) ;
std : : string data = json_value ( input_audio , " data " , std : : string ( ) ) ;
std : : string format = json_value ( input_audio , " format " , std : : string ( ) ) ;
// while we also support flac, we don't allow it here so that we match the OAI spec
if ( format ! = " wav " & & format ! = " mp3 " ) {
throw std : : runtime_error ( " input_audio.format must be either 'wav' or 'mp3' " ) ;
}
auto decoded_data = base64_decode ( data ) ; // expected to be base64 encoded
out_files . push_back ( decoded_data ) ;
// replace this chunk with a marker
p [ " type " ] = " text " ;
p [ " text " ] = mtmd_default_marker ( ) ;
p . erase ( " input_audio " ) ;
} else if ( type ! = " text " ) {
throw std : : runtime_error ( " unsupported content[].type " ) ;
2025-05-09 19:29:37 +02:00
}
}
}
2025-02-18 18:03:23 +00:00
common_chat_templates_inputs inputs ;
2025-05-09 19:29:37 +02:00
inputs . messages = common_chat_msgs_parse_oaicompat ( messages ) ;
2025-02-18 18:03:23 +00:00
inputs . tools = common_chat_tools_parse_oaicompat ( tools ) ;
2025-05-25 01:48:08 +01:00
inputs . tool_choice = common_chat_tool_choice_parse_oaicompat ( tool_choice ) ;
2025-02-18 18:03:23 +00:00
inputs . json_schema = json_schema . is_null ( ) ? " " : json_schema . dump ( ) ;
inputs . grammar = grammar ;
2025-05-23 11:03:47 +02:00
inputs . use_jinja = opt . use_jinja ;
2025-02-18 18:03:23 +00:00
inputs . parallel_tool_calls = json_value ( body , " parallel_tool_calls " , false ) ;
2025-05-25 10:45:49 +01:00
inputs . add_generation_prompt = json_value ( body , " add_generation_prompt " , true ) ;
2025-05-25 01:48:08 +01:00
inputs . reasoning_format = opt . reasoning_format ;
2025-05-26 00:30:51 +01:00
inputs . enable_thinking = opt . enable_thinking ;
2025-05-26 08:03:57 -07:00
if ( ! inputs . tools . empty ( ) & & inputs . tool_choice ! = COMMON_CHAT_TOOL_CHOICE_NONE ) {
if ( body . contains ( " grammar " ) ) {
throw std : : runtime_error ( " Cannot use custom grammar constraints with tools. " ) ;
}
llama_params [ " parse_tool_calls " ] = true ;
2025-02-18 18:03:23 +00:00
}
2025-06-29 20:02:53 +02:00
// merge the template args provided from command line with the args provided in the user request
auto chat_template_kwargs_object = json_value ( body , " chat_template_kwargs " , json : : object ( ) ) ;
inputs . chat_template_kwargs = opt . chat_template_kwargs ;
for ( const auto & item : chat_template_kwargs_object . items ( ) ) {
inputs . chat_template_kwargs [ item . key ( ) ] = item . value ( ) . dump ( ) ;
}
2025-09-05 14:31:24 -06:00
// parse the "enable_thinking" kwarg to override the default value
auto enable_thinking_kwarg = json_value ( inputs . chat_template_kwargs , " enable_thinking " , std : : string ( " " ) ) ;
if ( enable_thinking_kwarg = = " true " ) {
inputs . enable_thinking = true ;
} else if ( enable_thinking_kwarg = = " false " ) {
inputs . enable_thinking = false ;
} else if ( ! enable_thinking_kwarg . empty ( ) & & enable_thinking_kwarg [ 0 ] = = ' " ' ) {
throw std : : runtime_error ( " invalid type for \" enable_thinking \" (expected boolean, got string) " ) ;
}
2025-04-29 20:33:10 +02:00
// if the assistant message appears at the end of list, we do not add end-of-turn token
// for ex. this can be useful to modify the reasoning process in reasoning models
2025-05-23 11:03:47 +02:00
bool prefill_assistant_message = ! inputs . messages . empty ( ) & & inputs . messages . back ( ) . role = = " assistant " & & opt . prefill_assistant ;
2025-04-29 20:33:10 +02:00
common_chat_msg last_message ;
if ( prefill_assistant_message ) {
last_message = inputs . messages . back ( ) ;
inputs . messages . pop_back ( ) ;
/* sanity check, max one assistant message at the end of the list */
if ( ! inputs . messages . empty ( ) & & inputs . messages . back ( ) . role = = " assistant " ) {
throw std : : runtime_error ( " Cannot have 2 or more assistant messages at the end of the list. " ) ;
}
2025-05-25 01:48:08 +01:00
/* TODO: test this properly */
inputs . reasoning_format = COMMON_REASONING_FORMAT_NONE ;
2025-06-29 20:02:53 +02:00
2025-09-05 14:31:24 -06:00
if ( inputs . enable_thinking ) {
2025-06-29 20:02:53 +02:00
throw std : : runtime_error ( " Assistant response prefill is incompatible with enable_thinking. " ) ;
}
2025-04-29 20:33:10 +02:00
inputs . add_generation_prompt = true ;
}
2025-01-21 13:18:51 +00:00
// Apply chat template to the list of messages
2025-05-23 11:03:47 +02:00
auto chat_params = common_chat_templates_apply ( opt . tmpls , inputs ) ;
2025-02-18 18:03:23 +00:00
2025-04-29 20:33:10 +02:00
/* Append assistant prefilled message */
if ( prefill_assistant_message ) {
2025-07-05 09:17:14 +02:00
if ( ! last_message . content_parts . empty ( ) ) {
for ( auto & p : last_message . content_parts ) {
chat_params . prompt + = p . text ;
}
} else {
chat_params . prompt + = last_message . content ;
}
2025-04-29 20:33:10 +02:00
}
2025-02-18 18:03:23 +00:00
llama_params [ " chat_format " ] = static_cast < int > ( chat_params . format ) ;
llama_params [ " prompt " ] = chat_params . prompt ;
2025-03-14 11:21:17 +01:00
if ( ! chat_params . grammar . empty ( ) ) {
llama_params [ " grammar " ] = chat_params . grammar ;
}
2025-02-18 18:03:23 +00:00
llama_params [ " grammar_lazy " ] = chat_params . grammar_lazy ;
auto grammar_triggers = json : : array ( ) ;
for ( const auto & trigger : chat_params . grammar_triggers ) {
2025-04-02 09:58:34 +02:00
server_grammar_trigger ct ( trigger ) ;
grammar_triggers . push_back ( ct . to_json ( ) ) ;
2025-02-18 18:03:23 +00:00
}
llama_params [ " grammar_triggers " ] = grammar_triggers ;
llama_params [ " preserved_tokens " ] = chat_params . preserved_tokens ;
2025-05-25 01:48:08 +01:00
llama_params [ " thinking_forced_open " ] = chat_params . thinking_forced_open ;
2025-02-18 18:03:23 +00:00
for ( const auto & stop : chat_params . additional_stops ) {
llama_params [ " stop " ] . push_back ( stop ) ;
2025-01-21 13:18:51 +00:00
}
2024-03-25 09:42:17 +01:00
// Handle "n" field
int n_choices = json_value ( body , " n " , 1 ) ;
if ( n_choices ! = 1 ) {
throw std : : runtime_error ( " Only one completion choice is allowed " ) ;
}
// Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
2024-10-14 15:04:36 +08:00
if ( json_value ( body , " logprobs " , false ) ) {
2025-05-25 01:48:08 +01:00
if ( has_tools & & stream ) {
throw std : : runtime_error ( " logprobs is not supported with tools + stream " ) ;
}
2024-03-25 09:42:17 +01:00
llama_params [ " n_probs " ] = json_value ( body , " top_logprobs " , 20 ) ;
2024-10-14 15:04:36 +08:00
} else if ( body . contains ( " top_logprobs " ) & & ! body . at ( " top_logprobs " ) . is_null ( ) ) {
2024-03-25 09:42:17 +01:00
throw std : : runtime_error ( " top_logprobs requires logprobs to be set to true " ) ;
}
// Copy remaining properties to llama_params
2024-10-29 10:42:05 +02:00
// This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
2024-03-25 09:42:17 +01:00
// See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
for ( const auto & item : body . items ( ) ) {
// Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
if ( ! llama_params . contains ( item . key ( ) ) | | item . key ( ) = = " n_predict " ) {
llama_params [ item . key ( ) ] = item . value ( ) ;
}
}
2024-03-07 11:41:53 +02:00
return llama_params ;
}
2024-12-24 21:33:04 +01:00
// Builds the OAI-compatible embeddings response.
//   request    - original request body; only "model" is read (falls back to DEFAULT_OAICOMPAT_MODEL)
//   embeddings - iterable of per-input results, each read for "embedding" and "tokens_evaluated"
//   use_base64 - when true, each embedding is emitted as base64 of the raw float bytes
// Returns an object with "model", "object", "usage" and "data".
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
    json    data         = json::array();
    int32_t total_tokens = 0;
    int     index        = 0;

    for (const auto & elem : embeddings) {
        const json embedding = json_value(elem, "embedding", json::array());

        json item;
        if (use_base64) {
            // serialize the raw float buffer, as-is in memory
            const std::vector<float> values  = embedding.get<std::vector<float>>();
            const char *             raw     = reinterpret_cast<const char *>(values.data());
            const size_t             n_bytes = values.size() * sizeof(float);
            item = json {
                {"embedding",       base64::encode(raw, n_bytes)},
                {"index",           index},
                {"object",          "embedding"},
                {"encoding_format", "base64"},
            };
        } else {
            item = json {
                {"embedding", embedding},
                {"index",     index},
                {"object",    "embedding"},
            };
        }
        ++index;

        data.push_back(item);
        total_tokens += json_value(elem, "tokens_evaluated", 0);
    }

    const json usage = json {
        {"prompt_tokens", total_tokens},
        {"total_tokens",  total_tokens},
    };

    return json {
        {"model",  json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", "list"},
        {"usage",  usage},
        {"data",   data},
    };
}
2025-02-18 14:21:41 +01:00
// Formats rerank results either in TEI format (a bare array of {index, score[, text]})
// or in Jina format (an object with "model", "object", "usage" and "results").
//   request       - original request body ("return_text" and "model" are read)
//   ranks         - iterable of results, each read for "index", "score" and "tokens_evaluated"
//   is_tei_format - selects the TEI layout when true, the Jina layout otherwise
//   texts         - source documents; entries are moved out of this vector when text is returned
static json format_response_rerank(
        const json & request,
        const json & ranks,
        bool is_tei_format,
        std::vector<std::string> & texts) {
    if (is_tei_format) {
        // TEI response format
        const bool with_text = json_value(request, "return_text", false);

        json out = json::array();
        for (const auto & rank : ranks) {
            const int idx = json_value(rank, "index", 0);
            json entry = json {
                {"index", idx},
                {"score", json_value(rank, "score", 0.0)},
            };
            if (with_text) {
                // the text is moved, so `texts[idx]` is left in a moved-from state
                entry["text"] = std::move(texts[idx]);
            }
            out.push_back(entry);
        }
        return out;
    }

    // Jina response format
    json    results      = json::array();
    int32_t total_tokens = 0;
    for (const auto & rank : ranks) {
        results.push_back(json {
            {"index",           json_value(rank, "index", 0)},
            {"relevance_score", json_value(rank, "score", 0.0)},
        });
        total_tokens += json_value(rank, "tokens_evaluated", 0);
    }

    const json usage = json {
        {"prompt_tokens", total_tokens},
        {"total_tokens",  total_tokens},
    };

    return json {
        {"model",   json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object",  "list"},
        {"usage",   usage},
        {"results", results},
    };
}
2024-09-12 22:30:11 +02:00
// Structural UTF-8 check: every lead byte must announce a 1..4 byte sequence that is fully
// present and whose trailing bytes all have the form 10xxxxxx.
// Note: this does not reject overlong encodings, surrogates or code points above U+10FFFF.
static bool is_valid_utf8(const std::string & str) {
    const size_t n_bytes = str.size();
    size_t pos = 0;

    while (pos < n_bytes) {
        const unsigned char lead = static_cast<unsigned char>(str[pos]);

        // derive the sequence length from the lead byte
        size_t seq_len = 0;
        if (lead <= 0x7F) {
            seq_len = 1; // 0xxxxxxx
        } else if ((lead & 0xE0) == 0xC0) {
            seq_len = 2; // 110xxxxx
        } else if ((lead & 0xF0) == 0xE0) {
            seq_len = 3; // 1110xxxx
        } else if ((lead & 0xF8) == 0xF0) {
            seq_len = 4; // 11110xxx
        } else {
            return false; // invalid lead byte
        }

        // the whole sequence must fit in the remaining input
        if (n_bytes - pos < seq_len) {
            return false;
        }

        // all trailing bytes must be continuation bytes (10xxxxxx)
        for (size_t k = 1; k < seq_len; ++k) {
            const unsigned char cont = static_cast<unsigned char>(str[pos + k]);
            if ((cont & 0xC0) != 0x80) {
                return false;
            }
        }

        pos += seq_len;
    }

    return true;
}
// Wraps a token list into the tokenizer endpoint response: { "tokens": <tokens> }
static json format_tokenizer_response(const json & tokens) {
    json response = json::object();
    response["tokens"] = tokens;
    return response;
}
// Wraps detokenized text into the detokenizer endpoint response: { "content": <content> }
static json format_detokenized_response(const std::string & content) {
    json response = json::object();
    response["content"] = content;
    return response;
}
2024-12-07 20:21:09 +01:00
// Converts a list of logit biases into a JSON array of { "bias": ..., "token": ... } objects,
// keeping the input order.
static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
    json out = json::array();
    for (const auto & entry : logit_bias) {
        json item = json::object();
        item["bias"]  = entry.bias;
        item["token"] = entry.token;
        out.push_back(item);
    }
    return out;
}
2025-01-12 11:32:42 +02:00
static std : : string safe_json_to_str ( const json & data ) {
2024-12-07 20:21:09 +01:00
return data . dump ( - 1 , ' ' , false , json : : error_handler_t : : replace ) ;
}
2024-12-19 15:40:08 +01:00
static std : : vector < llama_token_data > get_token_probabilities ( llama_context * ctx , int idx ) {
std : : vector < llama_token_data > cur ;
const auto * logits = llama_get_logits_ith ( ctx , idx ) ;
2025-01-12 11:32:42 +02:00
const llama_model * model = llama_get_model ( ctx ) ;
const llama_vocab * vocab = llama_model_get_vocab ( model ) ;
const int n_vocab = llama_vocab_n_tokens ( vocab ) ;
2024-12-19 15:40:08 +01:00
cur . resize ( n_vocab ) ;
for ( llama_token token_id = 0 ; token_id < n_vocab ; token_id + + ) {
cur [ token_id ] = llama_token_data { token_id , logits [ token_id ] , 0.0f } ;
}
// sort tokens by logits
std : : sort ( cur . begin ( ) , cur . end ( ) , [ ] ( const llama_token_data & a , const llama_token_data & b ) {
return a . logit > b . logit ;
} ) ;
// apply softmax
float max_l = cur [ 0 ] . logit ;
float cum_sum = 0.0f ;
for ( size_t i = 0 ; i < cur . size ( ) ; + + i ) {
float p = expf ( cur [ i ] . logit - max_l ) ;
cur [ i ] . p = p ;
cum_sum + = p ;
}
for ( size_t i = 0 ; i < cur . size ( ) ; + + i ) {
cur [ i ] . p / = cum_sum ;
}
return cur ;
}
2025-01-02 15:05:18 +01:00
static bool are_lora_equal (
2025-01-12 11:32:42 +02:00
const std : : vector < common_adapter_lora_info > & l1 ,
const std : : vector < common_adapter_lora_info > & l2 ) {
2025-01-02 15:05:18 +01:00
if ( l1 . size ( ) ! = l2 . size ( ) ) {
return false ;
}
for ( size_t i = 0 ; i < l1 . size ( ) ; + + i ) {
// we don't check lora.path to reduce the time complexity
2025-01-03 10:18:53 +02:00
if ( l1 [ i ] . scale ! = l2 [ i ] . scale | | l1 [ i ] . ptr ! = l2 [ i ] . ptr ) {
2025-01-02 15:05:18 +01:00
return false ;
}
}
return true ;
}
2025-01-03 10:18:53 +02:00
// parse lora config from JSON request, returned a copy of lora_base with updated scale
2025-01-12 11:32:42 +02:00
static std : : vector < common_adapter_lora_info > parse_lora_request (
const std : : vector < common_adapter_lora_info > & lora_base ,
2025-01-02 15:05:18 +01:00
const json & data ) {
2025-01-12 11:32:42 +02:00
std : : vector < common_adapter_lora_info > lora ( lora_base ) ;
2025-01-02 15:05:18 +01:00
int max_idx = lora . size ( ) ;
// clear existing value
for ( auto & entry : lora ) {
entry . scale = 0.0f ;
}
// set value
for ( const auto & entry : data ) {
int id = json_value ( entry , " id " , - 1 ) ;
float scale = json_value ( entry , " scale " , 0.0f ) ;
if ( 0 < = id & & id < max_idx ) {
lora [ id ] . scale = scale ;
} else {
throw std : : runtime_error ( " invalid adapter id " ) ;
}
}
return lora ;
}
2025-05-09 19:29:37 +02:00
//
// utils for interacting with libmtmd
// (may need to refactor in near future)
//
/**
 * server_tokens is a helper to manage the input tokens and media chunks (image/audio) for the server.
 * it is made this way to simplify the logic of KV cache management.
 */
struct server_tokens {
bool has_mtmd = false ;
private : // disallow accessing these members directly, risking out-of-sync
// map a **start** position in tokens to the image chunk
2025-05-23 11:03:47 +02:00
std : : unordered_map < llama_pos , mtmd : : input_chunk_ptr > map_pos_to_media ;
2025-05-09 19:29:37 +02:00
// list of tokens
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
// a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
// important: for models using mrope, an image can contain multiple tokens but will use only one **position**
llama_tokens tokens ;
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// pos 0 1 2 3 4 5 6 7 8 9
2025-05-23 11:03:47 +02:00
// map_pos_to_media will contain: {5, img0}, {8, img1}
2025-05-09 19:29:37 +02:00
public :
server_tokens ( ) = default ;
~ server_tokens ( ) = default ;
// Prevent copying
server_tokens ( const server_tokens & ) = delete ;
server_tokens & operator = ( const server_tokens & ) = delete ;
// Allow moving (usually implicitly generated if members are movable)
server_tokens ( server_tokens & & ) = default ;
server_tokens & operator = ( server_tokens & & ) = default ;
// Allow accessing elements using [] operator
llama_token operator [ ] ( size_t index ) { return tokens [ index ] ; }
const llama_token & operator [ ] ( size_t index ) const { return tokens [ index ] ; }
server_tokens ( mtmd : : input_chunks & mtmd_chunks , bool has_mtmd ) : has_mtmd ( has_mtmd ) {
for ( size_t i = 0 ; i < mtmd_chunks . size ( ) ; + + i ) {
push_back ( mtmd_chunks [ i ] ) ;
}
}
server_tokens ( llama_tokens & tokens , bool has_mtmd ) : has_mtmd ( has_mtmd ) , tokens ( tokens ) { }
// for debugging
std : : string str ( ) const {
std : : ostringstream oss ;
oss < < " tokens: " ;
for ( const auto & t : tokens ) {
if ( t = = LLAMA_TOKEN_NULL ) {
oss < < " <embd> " ;
} else {
oss < < t < < " " ;
}
}
oss < < " \n " ;
oss < < " image pos: " ;
2025-05-23 11:03:47 +02:00
for ( const auto & it : map_pos_to_media ) {
2025-05-09 19:29:37 +02:00
oss < < it . first < < " , " ;
}
return oss . str ( ) ;
}
const mtmd : : input_chunk_ptr & find_chunk ( llama_pos pos ) const {
2025-05-23 11:03:47 +02:00
auto it = map_pos_to_media . find ( pos ) ;
if ( it ! = map_pos_to_media . end ( ) ) {
2025-05-09 19:29:37 +02:00
return it - > second ;
} else {
throw std : : runtime_error ( " Chunk not found " ) ;
}
}
void push_back ( llama_token tok ) {
if ( tok = = LLAMA_TOKEN_NULL ) {
throw std : : runtime_error ( " Invalid token " ) ;
}
tokens . emplace_back ( tok ) ;
}
// will create a copy of the chunk if it contains non-text data
void push_back ( const mtmd_input_chunk * chunk ) {
auto type = mtmd_input_chunk_get_type ( chunk ) ;
2025-05-23 11:03:47 +02:00
if ( type = = MTMD_INPUT_CHUNK_TYPE_IMAGE | | type = = MTMD_INPUT_CHUNK_TYPE_AUDIO ) {
2025-05-09 19:29:37 +02:00
GGML_ASSERT ( has_mtmd ) ;
2025-05-23 11:03:47 +02:00
const int n_pos = mtmd_input_chunk_get_n_pos ( chunk ) ;
2025-05-09 19:29:37 +02:00
llama_pos start_pos = tokens . size ( ) ;
for ( int i = 0 ; i < n_pos ; + + i ) {
tokens . emplace_back ( LLAMA_TOKEN_NULL ) ;
}
mtmd : : input_chunk_ptr new_chunk ( mtmd_input_chunk_copy ( chunk ) ) ;
2025-05-23 11:03:47 +02:00
map_pos_to_media [ start_pos ] = std : : move ( new_chunk ) ;
2025-05-09 19:29:37 +02:00
} else if ( type = = MTMD_INPUT_CHUNK_TYPE_TEXT ) {
size_t n_tokens ;
auto text_tokens = mtmd_input_chunk_get_tokens_text ( chunk , & n_tokens ) ;
for ( size_t i = 0 ; i < n_tokens ; + + i ) {
push_back ( text_tokens [ i ] ) ;
}
} else {
GGML_ABORT ( " Invalid chunk type " ) ;
}
}
2025-08-22 08:10:14 +00:00
// appends server tokens, updates the media map. copies media chunks.
void push_back ( server_tokens & tokens ) {
size_t start_pos = size ( ) ;
for ( size_t i = 0 ; i < tokens . size ( ) ; i + + ) {
push_back ( tokens [ i ] ) ;
}
if ( tokens . has_mtmd ) {
// Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
// We could also just check, but this will prevent silently dropping MTMD data.
GGML_ASSERT ( has_mtmd ) ;
for ( auto it = tokens . map_pos_to_media . begin ( ) ; it ! = tokens . map_pos_to_media . end ( ) ; ) {
auto chunk = tokens . map_pos_to_media [ it - > first ] . get ( ) ;
mtmd : : input_chunk_ptr new_chunk ( mtmd_input_chunk_copy ( chunk ) ) ;
map_pos_to_media [ start_pos + it - > first ] = std : : move ( new_chunk ) ;
}
}
}
2025-05-09 19:29:37 +02:00
// for compatibility with context shift and prompt truncation
void insert ( const llama_tokens & inp_tokens ) {
GGML_ASSERT ( ! has_mtmd ) ; // only allow this if mtmd is disabled
tokens . insert ( tokens . end ( ) , inp_tokens . begin ( ) , inp_tokens . end ( ) ) ;
}
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens ( ) const {
GGML_ASSERT ( ! has_mtmd ) ; // only allow this if mtmd is disabled
return tokens ;
}
// for compatibility with speculative decoding
void set_token ( llama_pos pos , llama_token id ) {
GGML_ASSERT ( ! has_mtmd ) ; // only allow this if mtmd is disabled
tokens [ pos ] = id ;
}
size_t size ( ) const {
return tokens . size ( ) ;
}
bool empty ( ) const {
return tokens . empty ( ) ;
}
void clear ( ) {
tokens . clear ( ) ;
}
2025-05-14 13:35:07 +02:00
void keep_first ( size_t n ) {
2025-05-09 19:29:37 +02:00
GGML_ASSERT ( n < = tokens . size ( ) ) ;
if ( has_mtmd ) {
2025-05-23 11:03:47 +02:00
if ( n = = tokens . size ( ) ) {
return ; // nothing to do
}
2025-05-09 19:29:37 +02:00
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// n 1 2 3 4 5 6 7 8 9 10
// allowed to resize ^ ^
// disallowed to resize ^ ^ ^
if ( n > 0 ) {
llama_token last_token = tokens [ n - 1 ] ;
// make sure we never remove tokens in the middle of an image
if ( last_token = = LLAMA_TOKEN_NULL ) {
find_chunk ( n - 1 ) ; // will throw an error if the token is not begin-of-chunk
}
}
// remove all image chunks that are not used anymore
2025-05-23 11:03:47 +02:00
for ( auto it = map_pos_to_media . begin ( ) ; it ! = map_pos_to_media . end ( ) ; ) {
2025-05-09 19:29:37 +02:00
llama_pos pos = it - > first ;
if ( pos > = ( llama_pos ) n ) {
2025-05-23 11:03:47 +02:00
it = map_pos_to_media . erase ( it ) ;
2025-05-09 19:29:37 +02:00
} else {
+ + it ;
}
}
}
tokens . resize ( n ) ;
}
std : : string detokenize ( const llama_context * ctx , bool special ) const {
llama_tokens text_tokens ;
text_tokens . reserve ( tokens . size ( ) ) ;
for ( const auto & t : tokens ) {
if ( t ! = LLAMA_TOKEN_NULL ) {
text_tokens . push_back ( t ) ;
}
}
return common_detokenize ( ctx , text_tokens , special ) ;
}
size_t get_common_prefix ( const server_tokens & b ) const {
size_t max_idx = std : : min ( tokens . size ( ) , b . tokens . size ( ) ) ;
for ( size_t i = 0 ; i < max_idx ; + + i ) {
auto & ai = tokens [ i ] ;
auto & bi = b . tokens [ i ] ;
if ( ai = = LLAMA_TOKEN_NULL & & bi = = LLAMA_TOKEN_NULL ) {
GGML_ASSERT ( has_mtmd ) ;
const auto & a_chunk = find_chunk ( i ) ;
const auto & b_chunk = b . find_chunk ( i ) ;
GGML_ASSERT ( a_chunk & & b_chunk ) ;
2025-05-23 11:03:47 +02:00
std : : string ai_id = mtmd_input_chunk_get_id ( a_chunk . get ( ) ) ;
std : : string bi_id = mtmd_input_chunk_get_id ( b_chunk . get ( ) ) ;
size_t a_pos = mtmd_input_chunk_get_n_pos ( a_chunk . get ( ) ) ;
size_t b_pos = mtmd_input_chunk_get_n_pos ( b_chunk . get ( ) ) ;
2025-05-09 19:29:37 +02:00
if ( ai_id = = bi_id & & a_pos = = b_pos ) {
2025-05-23 11:03:47 +02:00
GGML_ASSERT ( a_pos > 0 & & " Invalid media chunk " ) ; // should never happen
2025-05-09 19:29:37 +02:00
i + = a_pos - 1 ; // will be +1 by the for loop
continue ;
} else {
return i ;
}
} else if ( ai = = bi ) {
continue ;
} else {
return i ;
}
}
return max_idx ; // all tokens are equal
}
// make sure all text tokens are within the vocab range
bool validate ( const struct llama_context * ctx ) const {
const llama_model * model = llama_get_model ( ctx ) ;
const llama_vocab * vocab = llama_model_get_vocab ( model ) ;
const int32_t n_vocab = llama_vocab_n_tokens ( vocab ) ;
for ( size_t i = 0 ; i < tokens . size ( ) ; + + i ) {
auto & t = tokens [ i ] ;
if ( t = = LLAMA_TOKEN_NULL ) {
try {
const auto & chunk = find_chunk ( i ) ;
2025-05-23 11:03:47 +02:00
size_t n_pos = mtmd_input_chunk_get_n_pos ( chunk . get ( ) ) ;
2025-05-09 19:29:37 +02:00
i + = n_pos - 1 ; // will be +1 by the for loop
} catch ( const std : : exception & e ) {
return false ;
}
} else if ( t < 0 | | t > = n_vocab ) {
return false ;
}
}
return true ;
}
// encode and decode the image chunk
int32_t process_chunk (
llama_context * ctx ,
mtmd_context * mctx ,
llama_pos n_past ,
int32_t seq_id ,
llama_pos & n_pos_out ) {
2025-05-23 11:03:47 +02:00
auto & chunk = find_chunk ( n_past ) ;
const char * name = mtmd_input_chunk_get_type ( chunk . get ( ) ) = = MTMD_INPUT_CHUNK_TYPE_IMAGE
? " image " : " audio " ;
SRV_INF ( " processing %s... \n " , name ) ;
2025-05-09 19:29:37 +02:00
int32_t n_batch = llama_n_batch ( ctx ) ;
int64_t t0 = ggml_time_ms ( ) ;
llama_pos new_n_past = n_past ;
int32_t result = mtmd_helper_eval_chunk_single ( mctx , ctx ,
2025-05-23 11:03:47 +02:00
chunk . get ( ) ,
2025-05-09 19:29:37 +02:00
n_past ,
seq_id ,
n_batch ,
true , // logits last
& new_n_past ) ;
2025-05-23 11:03:47 +02:00
SRV_INF ( " %s processed in % " PRId64 " ms \n " , name , ggml_time_ms ( ) - t0 ) ;
2025-05-09 19:29:37 +02:00
if ( result ! = 0 ) {
LOG_ERR ( " mtmd_helper_eval failed with status %d " , result ) ;
n_pos_out = n_past ;
return result ;
}
n_pos_out = new_n_past ;
return 0 ;
}
} ;
// Computes the 64-bit FNV-1a hash of `len` bytes at `data` and returns it as a decimal string.
static std::string fnv_hash(const uint8_t * data, size_t len) {
    uint64_t acc = 0xcbf29ce484222325ULL; // FNV-1a 64-bit offset basis
    for (size_t remaining = len; remaining > 0; --remaining, ++data) {
        // xor the byte in first, then multiply by the 64-bit FNV prime
        acc = (acc ^ *data) * 0x100000001b3ULL;
    }
    return std::to_string(acc);
}
2025-08-22 08:10:14 +00:00
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
// BOS/EOS/SEP are only inserted when the vocab reports that they should be added.
// The result is flagged as multimodal when either input is, because appending a
// has_mtmd input to a non-mtmd server_tokens trips an assertion in push_back().
static server_tokens format_rerank(const struct llama_vocab * vocab, server_tokens & query, server_tokens & doc) {
    server_tokens result = {};
    result.has_mtmd = query.has_mtmd || doc.has_mtmd;

    // Get EOS token - use SEP token as fallback if EOS is not available
    llama_token eos_token = llama_vocab_eos(vocab);
    if (eos_token == LLAMA_TOKEN_NULL) {
        eos_token = llama_vocab_sep(vocab);
    }

    if (llama_vocab_get_add_bos(vocab)) {
        result.push_back(llama_vocab_bos(vocab));
    }
    result.push_back(query);
    if (llama_vocab_get_add_eos(vocab)) {
        result.push_back(eos_token);
    }
    if (llama_vocab_get_add_sep(vocab)) {
        result.push_back(llama_vocab_sep(vocab));
    }
    result.push_back(doc);
    if (llama_vocab_get_add_eos(vocab)) {
        result.push_back(eos_token);
    }
    return result;
}
// Tokenizes a multimodal prompt.
//   mctx   - multimodal context used to decode the files and tokenize the prompt
//   prompt - prompt text passed to mtmd_tokenize together with the decoded files
//   files  - raw (already base64-decoded) image/audio buffers
// Returns the resulting chunks as a server_tokens with has_mtmd set.
// Throws std::runtime_error if a file cannot be loaded or tokenization fails.
static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
    mtmd::bitmaps bitmaps;
    for (auto & file : files) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
        if (!bmp.ptr) {
            throw std::runtime_error("Failed to load image or audio file");
        }
        // calculate bitmap hash (for KV caching)
        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
        bmp.set_id(hash.c_str());
        bitmaps.entries.push_back(std::move(bmp));
    }

    // multimodal
    mtmd_input_text inp_txt = {
        prompt.c_str(),
        /* add_special */   true,
        /* parse_special */ true,
    };
    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = bitmaps.c_ptr();
    int32_t tokenized = mtmd_tokenize(mctx,
                                      chunks.ptr.get(),
                                      &inp_txt,
                                      bitmaps_c_ptr.data(),
                                      bitmaps_c_ptr.size());
    if (tokenized != 0) {
        throw std::runtime_error("Failed to tokenize prompt");
    }

    auto result = server_tokens(chunks, true);
    return result;
}
/**
 * tokenize a single "prompt" element
 * use tokenize_input_prompts() if the input could be an array of prompts.
 * this supports these cases:
 * - "prompt": "string"
 * - "prompt": [12, 34, 56]
 * - "prompt": [12, 34, "string", 56, 78]
 * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
 * throws std::runtime_error for any other shape, or when multimodal data is given without an mtmd context
 */
static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
    constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string";
    constexpr char JSON_MTMD_DATA_KEY[]     = "multimodal_data";

    // string or mixed
    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
        llama_tokens toks = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
        return server_tokens(toks, false);
    }

    // array of tokens
    if (json_is_array_of_numbers(json_prompt)) {
        llama_tokens toks = json_prompt.get<llama_tokens>();
        return server_tokens(toks, false);
    }

    // anything else must be a JSON object with the prompt key
    if (!json_prompt.contains(JSON_STRING_PROMPT_KEY)) {
        throw std::runtime_error("\"prompt\" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens.");
    }
    const json & prompt_text = json_prompt.at(JSON_STRING_PROMPT_KEY);

    // Not multimodal, but contains a subobject.
    if (!json_prompt.contains(JSON_MTMD_DATA_KEY)) {
        llama_tokens toks = tokenize_mixed(vocab, prompt_text, add_special, parse_special);
        return server_tokens(toks, false);
    }

    // JSON object with prompt and multimodal key.
    if (mctx == nullptr) {
        throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests.");
    }
    std::vector<raw_buffer> files;
    for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
        files.push_back(base64_decode(entry));
    }
    return process_mtmd_prompt(mctx, prompt_text, files);
}
/**
* break the input " prompt " object into multiple prompt if needed , then tokenize them
* this supports these cases :
* - " prompt " : " string "
* - " prompt " : [ 12 , 34 , 56 ]
* - " prompt " : [ 12 , 34 , " string " , 56 , 78 ]
* - " prompt " : { " prompt_string " : " string " , " multimodal_data " : [ " base64 " ] }
* and multiple prompts ( multi - tasks ) :
* - " prompt " : [ " string1 " , " string2 " ]
* - " prompt " : [ " string1 " , [ 12 , 34 , 56 ] ]
* - " prompt " : [[12, 34, 56], [78, 90, 12]]
* - " prompt " : [ [ 12 , 34 , " string " , 56 , 78 ] , [ 12 , 34 , 56 ] , { " prompt_string " : " string " , " multimodal_data " : [ " base64 " ] } ]
*/
static std : : vector < server_tokens > tokenize_input_prompts ( const llama_vocab * vocab , mtmd_context * mctx , const json & json_prompt , bool add_special , bool parse_special ) {
std : : vector < server_tokens > result ;
if ( json_prompt . is_array ( ) & & ! json_is_array_and_contains_numbers ( json_prompt ) ) {
result . reserve ( json_prompt . size ( ) ) ;
for ( const auto & p : json_prompt ) {
result . push_back ( tokenize_input_subprompt ( vocab , mctx , p , add_special , parse_special ) ) ;
}
} else {
result . push_back ( tokenize_input_subprompt ( vocab , mctx , json_prompt , add_special , parse_special ) ) ;
}
if ( result . empty ( ) ) {
throw std : : runtime_error ( " \" prompt \" must not be empty " ) ;
}
return result ;
}