2024-07-23 13:10:17 +03:00
# include "llama-impl.h"
2025-01-03 10:18:53 +02:00
# include "llama-chat.h"
# include "llama-mmap.h"
# include "llama-context.h"
2024-07-23 13:10:17 +03:00
# include "llama-vocab.h"
# include "llama-sampling.h"
2025-01-03 10:18:53 +02:00
# include "llama-kv-cache.h"
# include "llama-model-loader.h"
# include "llama-model.h"
2023-10-03 09:16:26 +02:00
2023-03-22 07:32:36 +02:00
# include "ggml.h"
2023-08-22 15:25:19 +02:00
# include "ggml-alloc.h"
2023-12-21 21:07:46 +01:00
# include "ggml-backend.h"
2024-11-01 23:48:26 +01:00
# include "ggml-cpp.h"
2023-08-21 23:07:43 +03:00
# include <algorithm>
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't
include the hack needed to support GPT4All files without conversion.
Those can still be used after converting them with convert.py from my
other PR.)
- Support both mmap and read (mmap is used by default, but can be
disabled with `--no-mmap`, and is automatically disabled for pre-ggjt
files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the
number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just
relying on the per-tensor type/size fields. This has no immediate
benefit, but makes it easier to experiment with different formats, and
should make it easier to support the new GPTQ-for-LLaMa models in the
future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on
Unix).
- Indicate loading progress when using mmap + mlock. (Which led me
to the interesting observation that on my Linux machine, with a
warm file cache, mlock actually takes some time, whereas mmap
without mlock starts almost instantly...)
- To help implement this, move mlock support from ggml to the
loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid
unnecessary copying and, when mmap is enabled, allow reusing the same
file descriptor for both metadata reads and mmap (whereas the existing
implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file
inputs (not really a feature as much as 'it was easier this way').
Implementation notes:
I tried to factor the code into more discrete pieces than before.
Regarding code style: I tried to follow the code style, but I'm naughty
and used a few advanced C++ features repeatedly:
- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and
I can remove them if desired... but here they make the loading code
much more succinct while still properly handling a variety of errors,
ranging from API calls failing to integer overflow and allocation
failure. (The exceptions are converted to error codes at the
API boundary.)
Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740)
2023-04-08 12:24:37 -07:00
# include <array>
2023-08-21 23:07:43 +03:00
# include <cassert>
2024-01-28 09:59:49 +01:00
# include <cfloat>
2023-11-01 18:04:33 -04:00
# include <cmath>
2023-08-26 14:17:51 -04:00
# include <cstddef>
# include <cstdint>
# include <cstdio>
2023-08-21 23:07:43 +03:00
# include <cstring>
# include <ctime>
2023-11-01 08:04:02 +02:00
# include <functional>
2023-03-29 13:51:37 -07:00
2023-06-16 21:23:53 +03:00
# if defined(_MSC_VER)
# pragma warning(disable: 4244 4267) // possible loss of data
# endif
2025-01-03 10:18:53 +02:00
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
2025-01-16 13:54:08 +01:00
static int llama_model_load ( const std : : string & fname , std : : vector < std : : string > & splits , llama_model & model , llama_model_params & params ) {
2025-01-12 11:32:42 +02:00
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model . t_load_us = 0 ;
time_meas tm ( model . t_load_us ) ;
model . t_start_us = tm . t_start_us ;
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
try {
2025-01-16 13:54:08 +01:00
llama_model_loader ml ( fname , splits , params . use_mmap , params . check_tensors , params . kv_overrides ) ;
2023-12-13 13:04:25 +01:00
2025-01-12 11:32:42 +02:00
ml . print_info ( ) ;
2025-01-03 10:18:53 +02:00
model . hparams . vocab_only = params . vocab_only ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
try {
2025-01-12 11:32:42 +02:00
model . load_arch ( ml ) ;
2025-01-03 10:18:53 +02:00
} catch ( const std : : exception & e ) {
throw std : : runtime_error ( " error loading model architecture: " + std : : string ( e . what ( ) ) ) ;
}
try {
2025-01-12 11:32:42 +02:00
model . load_hparams ( ml ) ;
2025-01-03 10:18:53 +02:00
} catch ( const std : : exception & e ) {
throw std : : runtime_error ( " error loading model hyperparameters: " + std : : string ( e . what ( ) ) ) ;
}
try {
2025-01-12 11:32:42 +02:00
model . load_vocab ( ml ) ;
2025-01-03 10:18:53 +02:00
} catch ( const std : : exception & e ) {
throw std : : runtime_error ( " error loading model vocabulary: " + std : : string ( e . what ( ) ) ) ;
}
2024-12-18 19:27:21 +02:00
2025-01-12 11:32:42 +02:00
model . load_stats ( ml ) ;
model . print_info ( ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
if ( params . vocab_only ) {
LLAMA_LOG_INFO ( " %s: vocab only - skipping tensors \n " , __func__ ) ;
return 0 ;
}
2024-07-05 05:14:21 +12:00
2025-01-12 11:32:42 +02:00
if ( ! model . load_tensors ( ml ) ) {
2025-01-03 10:18:53 +02:00
return - 2 ;
}
} catch ( const std : : exception & err ) {
LLAMA_LOG_ERROR ( " %s: error loading model: %s \n " , __func__ , err . what ( ) ) ;
return - 1 ;
}
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
return 0 ;
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
//
// llm_build
//
2023-11-01 18:04:33 -04:00
2025-01-03 10:18:53 +02:00
// Callback type taken by the llm_build_* helpers in this file. The helpers call it on
// tensors as they create them, passing the tensor, a name for it and a layer index
// (llm_build_inp_embd passes -1 as the index for inp_tokens / inp_embd).
using llm_build_cb = std : : function < void ( struct ggml_tensor * cur , const char * name , int nl ) > ;
2023-11-01 18:04:33 -04:00
2025-01-03 10:18:53 +02:00
// Activation applied by llm_build_ffn (all values) and llm_build_moe_ffn
// (only LLM_FFN_SILU and LLM_FFN_GELU; any other value aborts there).
enum llm_ffn_op_type {
// ggml_silu
LLM_FFN_SILU ,
// ggml_gelu, optionally followed by a division by act_scales in llm_build_ffn
LLM_FFN_GELU ,
// ggml_relu
LLM_FFN_RELU ,
// ggml_relu followed by ggml_sqr
LLM_FFN_RELU_SQR ,
// input is split in two halves along ne[0]: silu(first half) * second half
LLM_FFN_SWIGLU ,
} ;
2023-08-24 20:04:05 +02:00
2025-01-03 10:18:53 +02:00
// How llm_build_ffn wires the gate projection relative to the up projection.
enum llm_ffn_gate_type {
// the gate projection is applied to the output of the up projection
LLM_FFN_SEQ ,
// the gate projection is applied to the block input; after the activation the
// result is multiplied with the output of the up projection
LLM_FFN_PAR , // ffn_gate is parallel to ffn_up
} ;
2023-11-01 18:04:33 -04:00
2025-01-03 10:18:53 +02:00
// Normalization selected in llm_build_norm.
enum llm_norm_type {
// ggml_norm with hparams.f_norm_eps
LLM_NORM ,
// ggml_rms_norm with hparams.f_norm_rms_eps
LLM_NORM_RMS ,
// ggml_group_norm with hparams.n_norm_groups and hparams.f_norm_group_eps
LLM_NORM_GROUP ,
} ;
2023-03-24 23:17:37 +02:00
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_inp_embd (
struct ggml_context * ctx ,
struct llama_context & lctx ,
const llama_hparams & hparams ,
2025-01-06 10:28:17 +01:00
const llama_ubatch & ubatch ,
2025-01-03 10:18:53 +02:00
struct ggml_tensor * tok_embd ,
const llm_build_cb & cb ) {
const int64_t n_embd = hparams . n_embd ;
2024-05-22 04:28:32 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpL ;
2023-03-24 23:17:37 +02:00
2025-01-06 10:28:17 +01:00
if ( ubatch . token ) {
lctx . inp_tokens = ggml_new_tensor_1d ( ctx , GGML_TYPE_I32 , ubatch . n_tokens ) ;
2025-01-03 10:18:53 +02:00
cb ( lctx . inp_tokens , " inp_tokens " , - 1 ) ;
ggml_set_input ( lctx . inp_tokens ) ;
2024-01-02 03:51:28 -08:00
2025-01-03 10:18:53 +02:00
inpL = ggml_get_rows ( ctx , tok_embd , lctx . inp_tokens ) ;
2025-01-08 15:59:53 +01:00
// apply lora for embedding tokens if needed
2025-01-12 11:32:42 +02:00
for ( auto & it : lctx . lora ) {
struct llama_adapter_lora_weight * lw = it . first - > get_weight ( tok_embd ) ;
if ( lw = = nullptr ) {
2025-01-08 15:59:53 +01:00
continue ;
}
const float adapter_scale = it . second ;
2025-01-12 11:32:42 +02:00
const float scale = lw - > get_scale ( it . first - > alpha , adapter_scale ) ;
2025-01-08 15:59:53 +01:00
struct ggml_tensor * inpL_delta = ggml_scale ( ctx , ggml_mul_mat (
2025-01-12 11:32:42 +02:00
ctx , lw - > b , // non-transposed lora_b
ggml_get_rows ( ctx , lw - > a , lctx . inp_tokens )
2025-01-08 15:59:53 +01:00
) , scale ) ;
inpL = ggml_add ( ctx , inpL , inpL_delta ) ;
}
2024-07-05 05:14:21 +12:00
} else {
2025-01-06 10:28:17 +01:00
lctx . inp_embd = ggml_new_tensor_2d ( ctx , GGML_TYPE_F32 , n_embd , ubatch . n_tokens ) ;
2025-01-03 10:18:53 +02:00
inpL = lctx . inp_embd ;
ggml_set_input ( lctx . inp_embd ) ;
2024-07-05 05:14:21 +12:00
}
2024-01-02 03:51:28 -08:00
2025-01-03 10:18:53 +02:00
// For Granite architecture
if ( hparams . f_embedding_scale ! = 0.0f ) {
inpL = ggml_scale ( ctx , inpL , hparams . f_embedding_scale ) ;
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
cb ( inpL , " inp_embd " , - 1 ) ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
return inpL ;
}
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
static void llm_build_kv_store (
struct ggml_context * ctx ,
const llama_hparams & hparams ,
const llama_cparams & cparams ,
const llama_kv_cache & kv ,
struct ggml_cgraph * graph ,
struct ggml_tensor * k_cur ,
struct ggml_tensor * v_cur ,
int32_t n_tokens ,
int32_t kv_head ,
const llm_build_cb & cb ,
int64_t il ) {
const int64_t n_ctx = cparams . n_ctx ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_k_gqa = hparams . n_embd_k_gqa ( il ) ;
const int64_t n_embd_v_gqa = hparams . n_embd_v_gqa ( il ) ;
2024-02-17 23:04:16 +02:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( kv . size = = n_ctx ) ;
2024-02-17 23:04:16 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * k_cache_view = ggml_view_1d ( ctx , kv . k_l [ il ] , n_tokens * n_embd_k_gqa , ggml_row_size ( kv . k_l [ il ] - > type , n_embd_k_gqa ) * kv_head ) ;
cb ( k_cache_view , " k_cache_view " , il ) ;
2024-02-13 12:03:53 -05:00
2025-01-03 10:18:53 +02:00
// note: storing RoPE-ed version of K in the KV cache
ggml_build_forward_expand ( graph , ggml_cpy ( ctx , k_cur , k_cache_view ) ) ;
2024-02-17 23:04:16 +02:00
2025-01-03 10:18:53 +02:00
assert ( v_cur - > ne [ 0 ] = = n_embd_v_gqa & & v_cur - > ne [ 1 ] = = n_tokens ) ;
2023-12-05 10:19:18 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * v_cache_view = nullptr ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
if ( cparams . flash_attn ) {
v_cache_view = ggml_view_1d ( ctx , kv . v_l [ il ] , n_tokens * n_embd_v_gqa , ggml_row_size ( kv . v_l [ il ] - > type , n_embd_v_gqa ) * kv_head ) ;
} else {
// note: the V cache is transposed when not using flash attention
v_cache_view = ggml_view_2d ( ctx , kv . v_l [ il ] , n_tokens , n_embd_v_gqa ,
( n_ctx ) * ggml_element_size ( kv . v_l [ il ] ) ,
( kv_head ) * ggml_element_size ( kv . v_l [ il ] ) ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
v_cur = ggml_transpose ( ctx , v_cur ) ;
}
cb ( v_cache_view , " v_cache_view " , il ) ;
2024-08-10 13:04:40 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( graph , ggml_cpy ( ctx , v_cur , v_cache_view ) ) ;
}
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
// do mat_mul, while optionally apply lora
static struct ggml_tensor * llm_build_lora_mm (
struct llama_context & lctx ,
struct ggml_context * ctx0 ,
struct ggml_tensor * w ,
struct ggml_tensor * cur ) {
struct ggml_tensor * res = ggml_mul_mat ( ctx0 , w , cur ) ;
2025-01-12 11:32:42 +02:00
for ( auto & it : lctx . lora ) {
struct llama_adapter_lora_weight * lw = it . first - > get_weight ( w ) ;
if ( lw = = nullptr ) {
2025-01-03 10:18:53 +02:00
continue ;
}
2025-01-08 15:59:53 +01:00
const float adapter_scale = it . second ;
2025-01-12 11:32:42 +02:00
const float scale = lw - > get_scale ( it . first - > alpha , adapter_scale ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ab_cur = ggml_mul_mat (
2025-01-12 11:32:42 +02:00
ctx0 , lw - > b ,
ggml_mul_mat ( ctx0 , lw - > a , cur )
2025-01-03 10:18:53 +02:00
) ;
ab_cur = ggml_scale ( ctx0 , ab_cur , scale ) ;
res = ggml_add ( ctx0 , res , ab_cur ) ;
}
return res ;
}
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
// do mat_mul_id, while optionally apply lora
static struct ggml_tensor * llm_build_lora_mm_id (
struct llama_context & lctx ,
struct ggml_context * ctx0 ,
struct ggml_tensor * w , // struct ggml_tensor * as
struct ggml_tensor * cur , // struct ggml_tensor * b
struct ggml_tensor * ids ) {
struct ggml_tensor * res = ggml_mul_mat_id ( ctx0 , w , cur , ids ) ;
2025-01-12 11:32:42 +02:00
for ( auto & it : lctx . lora ) {
struct llama_adapter_lora_weight * lw = it . first - > get_weight ( w ) ;
if ( lw = = nullptr ) {
2025-01-03 10:18:53 +02:00
continue ;
}
const float alpha = it . first - > alpha ;
2025-01-12 11:32:42 +02:00
const float rank = ( float ) lw - > b - > ne [ 0 ] ;
2025-01-03 10:18:53 +02:00
const float scale = alpha ? it . second * alpha / rank : it . second ;
struct ggml_tensor * ab_cur = ggml_mul_mat_id (
2025-01-12 11:32:42 +02:00
ctx0 , lw - > b ,
ggml_mul_mat_id ( ctx0 , lw - > a , cur , ids ) ,
2025-01-03 10:18:53 +02:00
ids
) ;
ab_cur = ggml_scale ( ctx0 , ab_cur , scale ) ;
res = ggml_add ( ctx0 , res , ab_cur ) ;
}
return res ;
}
2024-02-21 05:08:22 -08:00
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_norm (
struct ggml_context * ctx ,
struct ggml_tensor * cur ,
const llama_hparams & hparams ,
struct ggml_tensor * mw ,
struct ggml_tensor * mb ,
llm_norm_type type ,
const llm_build_cb & cb ,
int il ) {
switch ( type ) {
case LLM_NORM : cur = ggml_norm ( ctx , cur , hparams . f_norm_eps ) ; break ;
case LLM_NORM_RMS : cur = ggml_rms_norm ( ctx , cur , hparams . f_norm_rms_eps ) ; break ;
case LLM_NORM_GROUP :
2024-03-15 16:41:22 -04:00
{
2025-01-03 10:18:53 +02:00
cur = ggml_reshape_3d ( ctx , cur , cur - > ne [ 0 ] , 1 , cur - > ne [ 1 ] ) ;
cur = ggml_group_norm ( ctx , cur , hparams . n_norm_groups , hparams . f_norm_group_eps ) ;
cur = ggml_reshape_2d ( ctx , cur , cur - > ne [ 0 ] , cur - > ne [ 2 ] ) ;
2024-03-15 16:41:22 -04:00
} break ;
2025-01-03 10:18:53 +02:00
}
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
if ( mw | | mb ) {
cb ( cur , " norm " , il ) ;
}
2024-11-19 01:04:08 -08:00
2025-01-03 10:18:53 +02:00
if ( mw ) {
cur = ggml_mul ( ctx , cur , mw ) ;
if ( mb ) {
cb ( cur , " norm_w " , il ) ;
}
}
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
if ( mb ) {
cur = ggml_add ( ctx , cur , mb ) ;
}
2024-06-24 02:27:57 +08:00
2025-01-03 10:18:53 +02:00
return cur ;
}
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_ffn (
struct ggml_context * ctx ,
struct llama_context & lctx ,
struct ggml_tensor * cur ,
struct ggml_tensor * up ,
struct ggml_tensor * up_b ,
struct ggml_tensor * up_s ,
struct ggml_tensor * gate ,
struct ggml_tensor * gate_b ,
struct ggml_tensor * gate_s ,
struct ggml_tensor * down ,
struct ggml_tensor * down_b ,
struct ggml_tensor * down_s ,
struct ggml_tensor * act_scales ,
llm_ffn_op_type type_op ,
llm_ffn_gate_type type_gate ,
const llm_build_cb & cb ,
int il ) {
struct ggml_tensor * tmp = up ? llm_build_lora_mm ( lctx , ctx , up , cur ) : cur ;
cb ( tmp , " ffn_up " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
if ( up_b ) {
tmp = ggml_add ( ctx , tmp , up_b ) ;
cb ( tmp , " ffn_up_b " , il ) ;
}
if ( up_s ) {
tmp = ggml_mul ( ctx , tmp , up_s ) ;
cb ( tmp , " ffn_up_s " , il ) ;
}
if ( gate ) {
switch ( type_gate ) {
case LLM_FFN_SEQ :
{
cur = llm_build_lora_mm ( lctx , ctx , gate , tmp ) ;
cb ( cur , " ffn_gate " , il ) ;
} break ;
case LLM_FFN_PAR :
{
cur = llm_build_lora_mm ( lctx , ctx , gate , cur ) ;
cb ( cur , " ffn_gate " , il ) ;
} break ;
}
if ( gate_b ) {
cur = ggml_add ( ctx , cur , gate_b ) ;
cb ( cur , " ffn_gate_b " , il ) ;
}
if ( gate_s ) {
cur = ggml_mul ( ctx , cur , gate_s ) ;
cb ( cur , " ffn_gate_s " , il ) ;
}
} else {
cur = tmp ;
}
switch ( type_op ) {
case LLM_FFN_SILU :
2024-08-10 11:43:26 +02:00
{
2025-01-03 10:18:53 +02:00
cur = ggml_silu ( ctx , cur ) ;
cb ( cur , " ffn_silu " , il ) ;
2024-08-10 11:43:26 +02:00
} break ;
2025-01-03 10:18:53 +02:00
case LLM_FFN_GELU :
2024-07-02 10:36:00 -04:00
{
2025-01-03 10:18:53 +02:00
cur = ggml_gelu ( ctx , cur ) ;
cb ( cur , " ffn_gelu " , il ) ;
if ( act_scales ! = NULL ) {
cur = ggml_div ( ctx , cur , act_scales ) ;
cb ( cur , " ffn_act " , il ) ;
2024-07-02 10:36:00 -04:00
}
} break ;
2025-01-03 10:18:53 +02:00
case LLM_FFN_RELU :
2024-08-15 19:23:33 -07:00
{
2025-01-03 10:18:53 +02:00
cur = ggml_relu ( ctx , cur ) ;
cb ( cur , " ffn_relu " , il ) ;
2024-08-15 19:23:33 -07:00
} break ;
2025-01-03 10:18:53 +02:00
case LLM_FFN_RELU_SQR :
2024-08-16 15:35:18 +09:00
{
2025-01-03 10:18:53 +02:00
cur = ggml_relu ( ctx , cur ) ;
cb ( cur , " ffn_relu " , il ) ;
2024-08-16 15:35:18 +09:00
2025-01-03 10:18:53 +02:00
cur = ggml_sqr ( ctx , cur ) ;
cb ( cur , " ffn_sqr(relu) " , il ) ;
2024-09-28 12:08:43 +00:00
} break ;
2025-01-03 10:18:53 +02:00
case LLM_FFN_SWIGLU :
2024-12-18 19:27:21 +02:00
{
2025-01-03 10:18:53 +02:00
// Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
int64_t split_point = cur - > ne [ 0 ] / 2 ;
struct ggml_tensor * x0 = ggml_cont ( ctx , ggml_view_2d ( ctx , cur , split_point , cur - > ne [ 1 ] , cur - > nb [ 1 ] , 0 ) ) ;
struct ggml_tensor * x1 = ggml_cont ( ctx , ggml_view_2d ( ctx , cur , split_point , cur - > ne [ 1 ] , cur - > nb [ 1 ] , split_point * ggml_element_size ( cur ) ) ) ;
x0 = ggml_silu ( ctx , x0 ) ;
cb ( cur , " ffn_silu " , il ) ;
cur = ggml_mul ( ctx , x0 , x1 ) ;
cb ( cur , " ffn_mul " , il ) ;
2024-12-18 19:27:21 +02:00
} break ;
2023-09-28 17:41:44 -04:00
}
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
if ( type_gate = = LLM_FFN_PAR ) {
cur = ggml_mul ( ctx , cur , tmp ) ;
cb ( cur , " ffn_gate_par " , il ) ;
2024-02-17 23:04:16 +02:00
}
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
if ( down ) {
cur = llm_build_lora_mm ( lctx , ctx , down , cur ) ;
}
2023-06-14 19:47:19 +02:00
2025-01-03 10:18:53 +02:00
if ( down_b ) {
cb ( cur , " ffn_down " , il ) ;
}
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
if ( down_b ) {
cur = ggml_add ( ctx , cur , down_b ) ;
}
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
if ( down_s ) {
cur = ggml_mul ( ctx , cur , down_s ) ;
cb ( cur , " ffn_down_s " , il ) ;
}
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
return cur ;
}
2024-09-17 12:18:22 +02:00
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_moe_ffn (
struct ggml_context * ctx ,
struct llama_context & lctx ,
struct ggml_tensor * cur ,
struct ggml_tensor * gate_inp ,
struct ggml_tensor * up_exps ,
struct ggml_tensor * gate_exps ,
struct ggml_tensor * down_exps ,
2025-01-04 21:06:11 +01:00
struct ggml_tensor * exp_probs_b ,
2025-01-03 10:18:53 +02:00
int64_t n_expert ,
int64_t n_expert_used ,
llm_ffn_op_type type_op ,
bool norm_w ,
bool scale_w ,
float w_scale ,
2025-01-04 21:06:11 +01:00
llama_expert_gating_func_type gating_op ,
2025-01-03 10:18:53 +02:00
const llm_build_cb & cb ,
int il ) {
int64_t n_embd = cur - > ne [ 0 ] ;
int64_t n_tokens = cur - > ne [ 1 ] ;
2024-07-22 13:33:22 +03:00
2025-01-03 10:18:53 +02:00
ggml_tensor * logits = llm_build_lora_mm ( lctx , ctx , gate_inp , cur ) ; // [n_expert, n_tokens]
cb ( logits , " ffn_moe_logits " , il ) ;
2023-08-21 23:07:43 +03:00
2025-01-04 21:06:11 +01:00
ggml_tensor * probs = nullptr ;
switch ( gating_op ) {
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX :
{
probs = ggml_soft_max ( ctx , logits ) ; // [n_expert, n_tokens]
} break ;
case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID :
{
probs = ggml_sigmoid ( ctx , logits ) ; // [n_expert, n_tokens]
} break ;
default :
GGML_ABORT ( " fatal error " ) ;
}
2025-01-03 10:18:53 +02:00
cb ( probs , " ffn_moe_probs " , il ) ;
2023-08-21 23:07:43 +03:00
2025-01-04 21:06:11 +01:00
// add experts selection bias - introduced in DeepSeek V3
// leave probs unbiased as it's later used to get expert weights
ggml_tensor * selection_probs = probs ;
if ( exp_probs_b ! = nullptr ) {
selection_probs = ggml_add ( ctx , probs , exp_probs_b ) ;
cb ( selection_probs , " ffn_moe_probs_biased " , il ) ;
}
2025-01-03 10:18:53 +02:00
// select experts
2025-01-04 21:06:11 +01:00
ggml_tensor * selected_experts = ggml_top_k ( ctx , selection_probs , n_expert_used ) ; // [n_expert_used, n_tokens]
2025-01-03 10:18:53 +02:00
cb ( selected_experts - > src [ 0 ] , " ffn_moe_argsort " , il ) ;
cb ( selected_experts , " ffn_moe_topk " , il ) ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
ggml_tensor * weights = ggml_get_rows ( ctx ,
ggml_reshape_3d ( ctx , probs , 1 , n_expert , n_tokens ) , selected_experts ) ; // [1, n_expert_used, n_tokens]
cb ( weights , " ffn_moe_weights " , il ) ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
if ( norm_w ) {
weights = ggml_reshape_2d ( ctx , weights , n_expert_used , n_tokens ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
ggml_tensor * weights_sum = ggml_sum_rows ( ctx , weights ) ; // [1, n_tokens]
cb ( weights_sum , " ffn_moe_weights_sum " , il ) ;
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
weights = ggml_div ( ctx , weights , weights_sum ) ; // [n_expert_used, n_tokens]
cb ( weights , " ffn_moe_weights_norm " , il ) ;
2024-07-05 19:01:35 +02:00
2025-01-03 10:18:53 +02:00
weights = ggml_reshape_3d ( ctx , weights , 1 , n_expert_used , n_tokens ) ;
2023-05-12 00:23:08 +03:00
}
2025-01-03 10:18:53 +02:00
if ( scale_w ) {
weights = ggml_scale ( ctx , weights , w_scale ) ;
cb ( weights , " ffn_moe_weights_scaled " , il ) ;
2024-03-14 17:21:56 +01:00
}
2025-01-03 10:18:53 +02:00
cur = ggml_reshape_3d ( ctx , cur , n_embd , 1 , n_tokens ) ;
ggml_tensor * up = llm_build_lora_mm_id ( lctx , ctx , up_exps , cur , selected_experts ) ; // [n_ff, n_expert_used, n_tokens]
cb ( up , " ffn_moe_up " , il ) ;
2024-03-14 17:21:56 +01:00
2025-01-03 10:18:53 +02:00
ggml_tensor * gate = llm_build_lora_mm_id ( lctx , ctx , gate_exps , cur , selected_experts ) ; // [n_ff, n_expert_used, n_tokens]
cb ( gate , " ffn_moe_gate " , il ) ;
2024-03-14 17:21:56 +01:00
2025-01-03 10:18:53 +02:00
switch ( type_op ) {
case LLM_FFN_SILU :
{
gate = ggml_silu ( ctx , gate ) ;
cb ( gate , " ffn_moe_silu " , il ) ;
} break ;
case LLM_FFN_GELU :
{
gate = ggml_gelu ( ctx , gate ) ;
cb ( gate , " ffn_moe_gelu " , il ) ;
} break ;
default :
GGML_ABORT ( " fatal error " ) ;
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
ggml_tensor * par = ggml_mul ( ctx , up , gate ) ; // [n_ff, n_expert_used, n_tokens]
cb ( par , " ffn_moe_gate_par " , il ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
ggml_tensor * experts = llm_build_lora_mm_id ( lctx , ctx , down_exps , par , selected_experts ) ; // [n_embd, n_expert_used, n_tokens]
cb ( experts , " ffn_moe_down " , il ) ;
2023-03-29 02:03:43 +02:00
2025-01-03 10:18:53 +02:00
experts = ggml_mul ( ctx , experts , weights ) ;
2024-09-28 20:10:58 +08:00
2025-01-03 10:18:53 +02:00
// aggregate experts
ggml_tensor * moe_out = nullptr ;
for ( int i = 0 ; i < n_expert_used ; + + i ) {
ggml_tensor * cur_expert = ggml_view_2d ( ctx , experts , n_embd , n_tokens ,
experts - > nb [ 2 ] , i * experts - > nb [ 1 ] ) ;
2024-09-28 17:42:03 +03:00
2025-01-03 10:18:53 +02:00
if ( i = = 0 ) {
moe_out = cur_expert ;
2024-09-28 17:42:03 +03:00
} else {
2025-01-03 10:18:53 +02:00
moe_out = ggml_add ( ctx , moe_out , cur_expert ) ;
2024-09-28 17:42:03 +03:00
}
2023-08-27 14:19:19 +03:00
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
if ( n_expert_used = = 1 ) {
// avoid returning a non-contiguous tensor
moe_out = ggml_cont ( ctx , moe_out ) ;
}
2023-11-16 19:14:37 -07:00
2025-01-03 10:18:53 +02:00
return moe_out ;
}
2023-12-05 10:19:18 -07:00
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_kqv (
struct ggml_context * ctx ,
struct llama_context & lctx ,
const llama_kv_cache & kv ,
struct ggml_cgraph * graph ,
struct ggml_tensor * wo ,
struct ggml_tensor * wo_b ,
struct ggml_tensor * q_cur ,
struct ggml_tensor * kq_mask ,
int32_t n_tokens ,
int32_t n_kv ,
float kq_scale ,
const llm_build_cb & cb ,
int il ) {
const llama_model & model = lctx . model ;
const llama_hparams & hparams = lctx . model . hparams ;
const llama_cparams & cparams = lctx . cparams ;
2024-04-21 13:50:41 +02:00
2025-01-03 10:18:53 +02:00
const int64_t n_ctx = cparams . n_ctx ;
const int64_t n_head = hparams . n_head ( il ) ;
const int64_t n_head_kv = hparams . n_head_kv ( il ) ;
const int64_t n_embd_head_k = hparams . n_embd_head_k ;
const int64_t n_embd_k_gqa = hparams . n_embd_k_gqa ( il ) ;
const int64_t n_embd_head_v = hparams . n_embd_head_v ;
const int64_t n_embd_v_gqa = hparams . n_embd_v_gqa ( il ) ;
2024-08-05 09:38:01 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * q = ggml_permute ( ctx , q_cur , 0 , 2 , 1 , 3 ) ;
cb ( q , " q " , il ) ;
2024-10-12 08:21:51 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * k =
ggml_view_3d ( ctx , kv . k_l [ il ] ,
n_embd_head_k , n_kv , n_head_kv ,
ggml_row_size ( kv . k_l [ il ] - > type , n_embd_k_gqa ) ,
ggml_row_size ( kv . k_l [ il ] - > type , n_embd_head_k ) ,
0 ) ;
cb ( k , " k " , il ) ;
2024-10-12 08:21:51 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
2024-10-12 08:21:51 +03:00
2025-01-03 10:18:53 +02:00
if ( cparams . flash_attn ) {
GGML_UNUSED ( model ) ;
GGML_UNUSED ( n_ctx ) ;
2024-10-12 08:21:51 +03:00
2025-01-03 10:18:53 +02:00
// split cached v into n_head heads (not transposed)
struct ggml_tensor * v =
ggml_view_3d ( ctx , kv . v_l [ il ] ,
n_embd_head_v , n_kv , n_head_kv ,
ggml_row_size ( kv . v_l [ il ] - > type , n_embd_v_gqa ) ,
ggml_row_size ( kv . v_l [ il ] - > type , n_embd_head_v ) ,
0 ) ;
cb ( v , " v " , il ) ;
2024-10-12 08:21:51 +03:00
2025-01-03 10:18:53 +02:00
cur = ggml_flash_attn_ext ( ctx , q , k , v , kq_mask , kq_scale , hparams . f_max_alibi_bias ,
hparams . attn_soft_cap ? hparams . f_attn_logit_softcapping : 0.0f ) ;
2024-10-12 08:21:51 +03:00
2025-01-03 10:18:53 +02:00
ggml_flash_attn_ext_set_prec ( cur , GGML_PREC_F32 ) ;
2024-09-24 10:16:06 +03:00
2025-01-03 10:18:53 +02:00
cur = ggml_reshape_2d ( ctx , cur , n_embd_head_v * n_head , n_tokens ) ;
} else {
struct ggml_tensor * kq = ggml_mul_mat ( ctx , k , q ) ;
cb ( kq , " kq " , il ) ;
2024-10-13 21:31:35 +03:00
2025-01-03 10:18:53 +02:00
// note: this op tends to require high floating point range
// while for some models F16 is enough, for others it is not, so we default to F32 here
ggml_mul_mat_set_prec ( kq , GGML_PREC_F32 ) ;
2024-10-13 21:31:35 +03:00
2025-01-03 10:18:53 +02:00
if ( model . arch = = LLM_ARCH_GROK ) {
// need to do the following:
// multiply by attn_output_multiplyer of 0.08838834764831845
// and then :
// kq = 30 * tanh(kq / 30)
// before the softmax below
2024-10-13 21:31:35 +03:00
2025-01-03 10:18:53 +02:00
kq = ggml_tanh ( ctx , ggml_scale ( ctx , kq , 0.08838834764831845f / 30.0f ) ) ;
kq = ggml_scale ( ctx , kq , 30 ) ;
2024-10-13 21:31:35 +03:00
}
2025-01-03 10:18:53 +02:00
if ( hparams . attn_soft_cap ) {
kq = ggml_scale ( ctx , kq , 1.0f / hparams . f_attn_logit_softcapping ) ;
kq = ggml_tanh ( ctx , kq ) ;
kq = ggml_scale ( ctx , kq , hparams . f_attn_logit_softcapping ) ;
2024-09-24 10:16:06 +03:00
}
2025-01-03 10:18:53 +02:00
kq = ggml_soft_max_ext ( ctx , kq , kq_mask , kq_scale , hparams . f_max_alibi_bias ) ;
cb ( kq , " kq_soft_max_ext " , il ) ;
2024-09-24 10:16:06 +03:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( kv . size = = n_ctx ) ;
2024-09-24 10:16:06 +03:00
2025-01-03 10:18:53 +02:00
// split cached v into n_head heads
struct ggml_tensor * v =
ggml_view_3d ( ctx , kv . v_l [ il ] ,
n_kv , n_embd_head_v , n_head_kv ,
ggml_element_size ( kv . v_l [ il ] ) * n_ctx ,
ggml_element_size ( kv . v_l [ il ] ) * n_ctx * n_embd_head_v ,
0 ) ;
cb ( v , " v " , il ) ;
2023-10-17 17:11:01 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * kqv = ggml_mul_mat ( ctx , v , kq ) ;
cb ( kqv , " kqv " , il ) ;
2023-10-17 17:11:01 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * kqv_merged = ggml_permute ( ctx , kqv , 0 , 2 , 1 , 3 ) ;
cb ( kqv_merged , " kqv_merged " , il ) ;
2023-10-17 17:11:01 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_cont_2d ( ctx , kqv_merged , n_embd_head_v * n_head , n_tokens ) ;
cb ( cur , " kqv_merged_cont " , il ) ;
2024-05-30 19:01:41 +03:00
}
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( graph , cur ) ;
2024-05-30 19:01:41 +03:00
2025-01-03 10:18:53 +02:00
if ( wo ) {
cur = llm_build_lora_mm ( lctx , ctx , wo , cur ) ;
}
2024-05-30 19:01:41 +03:00
2025-01-03 10:18:53 +02:00
if ( wo_b ) {
cb ( cur , " kqv_wo " , il ) ;
}
2024-05-30 19:01:41 +03:00
2025-01-03 10:18:53 +02:00
if ( wo_b ) {
cur = ggml_add ( ctx , cur , wo_b ) ;
2023-10-17 17:11:01 +02:00
}
2024-06-04 09:17:17 +02:00
2025-01-03 10:18:53 +02:00
return cur ;
}
2024-06-04 09:17:17 +02:00
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_kv (
struct ggml_context * ctx ,
struct llama_context & lctx ,
const llama_kv_cache & kv ,
struct ggml_cgraph * graph ,
struct ggml_tensor * wo ,
struct ggml_tensor * wo_b ,
struct ggml_tensor * k_cur ,
struct ggml_tensor * v_cur ,
struct ggml_tensor * q_cur ,
struct ggml_tensor * kq_mask ,
int32_t n_tokens ,
int32_t kv_head ,
int32_t n_kv ,
float kq_scale ,
const llm_build_cb & cb ,
int il ) {
const llama_hparams & hparams = lctx . model . hparams ;
const llama_cparams & cparams = lctx . cparams ;
2024-06-04 09:17:17 +02:00
2025-01-03 10:18:53 +02:00
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand ( graph , q_cur ) ;
ggml_build_forward_expand ( graph , k_cur ) ;
ggml_build_forward_expand ( graph , v_cur ) ;
2024-06-04 09:17:17 +02:00
2025-01-03 10:18:53 +02:00
llm_build_kv_store ( ctx , hparams , cparams , kv , graph , k_cur , v_cur , n_tokens , kv_head , cb , il ) ;
2024-06-04 09:17:17 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
2024-06-04 09:17:17 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kqv ( ctx , lctx , kv , graph , wo , wo_b , q_cur , kq_mask , n_tokens , n_kv , kq_scale , cb , il ) ;
cb ( cur , " kqv_out " , il ) ;
2024-06-04 09:17:17 +02:00
2025-01-03 10:18:53 +02:00
return cur ;
2023-08-23 23:08:04 +03:00
}
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_copy_mask_state (
struct ggml_context * ctx ,
struct ggml_cgraph * graph ,
struct ggml_tensor * s ,
struct ggml_tensor * state_copy ,
struct ggml_tensor * state_mask ,
int32_t n_state ,
int32_t kv_size ,
int32_t kv_head ,
int32_t n_kv ,
int32_t n_seqs ) {
struct ggml_tensor * states = ggml_reshape_2d ( ctx , s , n_state , kv_size ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
// copy states
// NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
// this shrinks the tensors's ne[1] to n_kv
states = ggml_get_rows ( ctx , states , state_copy ) ;
2023-11-01 18:04:33 -04:00
2025-01-03 10:18:53 +02:00
// clear states of sequences which are starting at the beginning of this batch
// FIXME: zero-out NANs?
states = ggml_mul ( ctx , states , state_mask ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// copy states which won't be changed further (between n_seqs and n_kv)
ggml_build_forward_expand ( graph ,
ggml_cpy ( ctx ,
ggml_view_1d ( ctx , states , n_state * ( n_kv - n_seqs ) , n_seqs * n_state * ggml_element_size ( states ) ) ,
ggml_view_1d ( ctx , s , n_state * ( n_kv - n_seqs ) , ( kv_head + n_seqs ) * n_state * ggml_element_size ( s ) ) ) ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// the part of the states that will be used and modified
return ggml_view_2d ( ctx , states , n_state , n_seqs , states - > nb [ 1 ] , 0 ) ;
}
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// TODO: split
static struct ggml_tensor * llm_build_mamba (
struct ggml_context * ctx ,
struct llama_context & lctx ,
2025-01-06 10:28:17 +01:00
const llama_ubatch & ubatch ,
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * graph ,
struct ggml_tensor * cur ,
struct ggml_tensor * state_copy ,
struct ggml_tensor * state_mask ,
int32_t kv_head ,
int32_t n_kv ,
const llm_build_cb & cb ,
int il ) {
const llama_model & model = lctx . model ;
const llama_hparams & hparams = model . hparams ;
const llama_kv_cache & kv = lctx . kv_self ;
const int64_t d_conv = hparams . ssm_d_conv ;
const int64_t d_inner = hparams . ssm_d_inner ;
const int64_t d_state = hparams . ssm_d_state ;
const int64_t dt_rank = hparams . ssm_dt_rank ;
2025-01-06 10:28:17 +01:00
const int64_t n_seqs = ubatch . n_seqs ;
2025-01-03 10:18:53 +02:00
// Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
const bool ssm_dt_b_c_rms = hparams . ssm_dt_b_c_rms ;
// Use the same RMS norm as the final layer norm
const float norm_rms_eps = hparams . f_norm_rms_eps ;
2024-07-05 05:14:21 +12:00
2025-01-06 10:28:17 +01:00
const int64_t n_seq_tokens = ubatch . n_seq_tokens ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( n_seqs ! = 0 ) ;
2025-01-06 10:28:17 +01:00
GGML_ASSERT ( ubatch . equal_seqs ) ;
GGML_ASSERT ( ubatch . n_tokens = = n_seq_tokens * n_seqs ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * conv_states_all = kv . k_l [ il ] ;
struct ggml_tensor * ssm_states_all = kv . v_l [ il ] ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// (ab)using the KV cache to store the states
struct ggml_tensor * conv = llm_build_copy_mask_state ( ctx ,
graph , conv_states_all , state_copy , state_mask ,
hparams . n_embd_k_s ( ) , kv . size , kv_head , n_kv , n_seqs ) ;
conv = ggml_reshape_3d ( ctx , conv , d_conv - 1 , d_inner , n_seqs ) ;
struct ggml_tensor * ssm = llm_build_copy_mask_state ( ctx ,
graph , ssm_states_all , state_copy , state_mask ,
hparams . n_embd_v_s ( ) , kv . size , kv_head , n_kv , n_seqs ) ;
ssm = ggml_reshape_3d ( ctx , ssm , d_state , d_inner , n_seqs ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
cur = ggml_reshape_3d ( ctx , cur , cur - > ne [ 0 ] , n_seq_tokens , n_seqs ) ;
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
// {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
struct ggml_tensor * xz = llm_build_lora_mm ( lctx , ctx , model . layers [ il ] . ssm_in , cur ) ;
// split the above in two
// => {d_inner, n_seq_tokens, n_seqs}
struct ggml_tensor * x = ggml_view_3d ( ctx , xz , d_inner , xz - > ne [ 1 ] , xz - > ne [ 2 ] , xz - > nb [ 1 ] , xz - > nb [ 2 ] , 0 ) ;
struct ggml_tensor * z = ggml_view_3d ( ctx , xz , d_inner , xz - > ne [ 1 ] , xz - > ne [ 2 ] , xz - > nb [ 1 ] , xz - > nb [ 2 ] , d_inner * ggml_element_size ( xz ) ) ;
2024-06-21 08:51:28 +03:00
2025-01-03 10:18:53 +02:00
// conv
{
// => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
struct ggml_tensor * conv_x = ggml_concat ( ctx , conv , ggml_transpose ( ctx , x ) , 0 ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
// copy last (d_conv - 1) columns back into the state cache
struct ggml_tensor * last_conv = ggml_view_3d ( ctx , conv_x , d_conv - 1 , d_inner , n_seqs , conv_x - > nb [ 1 ] , conv_x - > nb [ 2 ] , n_seq_tokens * ( conv_x - > nb [ 0 ] ) ) ;
2024-06-17 22:08:46 +03:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( graph ,
ggml_cpy ( ctx , last_conv ,
ggml_view_1d ( ctx , conv_states_all ,
( d_conv - 1 ) * ( d_inner ) * ( n_seqs ) ,
kv_head * ( d_conv - 1 ) * ( d_inner ) * ggml_element_size ( conv_states_all ) ) ) ) ;
// 1D convolution
// The equivalent is to make a self-overlapping view of conv_x
// over d_conv columns at each stride in the 3rd dimension,
// then element-wise multiply that with the conv1d weight,
// then sum the elements of each row,
// (the last two steps are a dot product over rows (also doable with mul_mat))
// then permute away the ne[0] dimension,
// and then you're left with the resulting x tensor.
// For simultaneous sequences, all sequences need to have the same length.
x = ggml_ssm_conv ( ctx , conv_x , model . layers [ il ] . ssm_conv1d ) ;
// bias
x = ggml_add ( ctx , x , model . layers [ il ] . ssm_conv1d_b ) ;
2024-09-17 00:44:58 -06:00
2025-01-03 10:18:53 +02:00
x = ggml_silu ( ctx , x ) ;
2024-09-17 00:44:58 -06:00
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
// ssm
{
// {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
struct ggml_tensor * x_db = llm_build_lora_mm ( lctx , ctx , model . layers [ il ] . ssm_x , x ) ;
// split
struct ggml_tensor * dt = ggml_view_3d ( ctx , x_db , dt_rank , n_seq_tokens , n_seqs , x_db - > nb [ 1 ] , x_db - > nb [ 2 ] , 0 ) ;
struct ggml_tensor * B = ggml_view_3d ( ctx , x_db , d_state , n_seq_tokens , n_seqs , x_db - > nb [ 1 ] , x_db - > nb [ 2 ] , ggml_element_size ( x_db ) * dt_rank ) ;
struct ggml_tensor * C = ggml_view_3d ( ctx , x_db , d_state , n_seq_tokens , n_seqs , x_db - > nb [ 1 ] , x_db - > nb [ 2 ] , ggml_element_size ( x_db ) * ( dt_rank + d_state ) ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
if ( ssm_dt_b_c_rms ) {
dt = ggml_rms_norm ( ctx , dt , norm_rms_eps ) ;
B = ggml_rms_norm ( ctx , B , norm_rms_eps ) ;
C = ggml_rms_norm ( ctx , C , norm_rms_eps ) ;
}
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
dt = llm_build_lora_mm ( lctx , ctx , model . layers [ il ] . ssm_dt , dt ) ;
dt = ggml_add ( ctx , dt , model . layers [ il ] . ssm_dt_b ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// Custom operator to optimize the parallel associative scan
// as described in the Annex D of the Mamba paper.
// => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
struct ggml_tensor * y_ssm = ggml_ssm_scan ( ctx , ssm , x , dt , model . layers [ il ] . ssm_a , B , C ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// store last states
ggml_build_forward_expand ( graph ,
ggml_cpy ( ctx ,
ggml_view_1d ( ctx , y_ssm , d_state * d_inner * n_seqs , x - > nb [ 3 ] ) ,
ggml_view_1d ( ctx , ssm_states_all , d_state * d_inner * n_seqs , kv_head * d_state * d_inner * ggml_element_size ( ssm_states_all ) ) ) ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * y = ggml_view_3d ( ctx , y_ssm , d_inner , n_seq_tokens , n_seqs , x - > nb [ 1 ] , x - > nb [ 2 ] , 0 ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// TODO: skip computing output earlier for unused tokens
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
y = ggml_add ( ctx , y , ggml_mul ( ctx , x , model . layers [ il ] . ssm_d ) ) ;
y = ggml_mul ( ctx , y , ggml_silu ( ctx , ggml_cont ( ctx , z ) ) ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
cur = llm_build_lora_mm ( lctx , ctx , model . layers [ il ] . ssm_out , y ) ;
2024-10-30 02:01:23 +01:00
}
2025-01-03 10:18:53 +02:00
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
cur = ggml_reshape_2d ( ctx , cur , cur - > ne [ 0 ] , n_seq_tokens * n_seqs ) ;
cb ( cur , " mamba_out " , il ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
return cur ;
2024-10-30 02:01:23 +01:00
}
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_rwkv6_time_mix (
struct llama_context & lctx ,
struct ggml_context * ctx ,
const struct llama_layer * layer ,
struct ggml_tensor * cur ,
struct ggml_tensor * x_prev ,
2025-01-10 09:58:08 +08:00
struct ggml_tensor * * wkv_state ,
size_t wkv_head_size ,
size_t head_count_kv ) {
2025-01-03 10:18:53 +02:00
size_t n_embd = cur - > ne [ 0 ] ;
size_t n_seq_tokens = cur - > ne [ 1 ] ;
size_t n_seqs = cur - > ne [ 2 ] ;
2024-10-30 02:01:23 +01:00
2025-01-10 09:58:08 +08:00
size_t head_size = wkv_head_size ;
size_t head_count = n_embd / head_size ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
size_t n_tokens = n_seqs * n_seq_tokens ;
2024-10-30 02:01:23 +01:00
2025-01-10 09:58:08 +08:00
bool is_qrwkv = layer - > time_mix_first = = nullptr ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * sx = ggml_sub ( ctx , x_prev , cur ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
sx = ggml_reshape_2d ( ctx , sx , n_embd , n_tokens ) ;
cur = ggml_reshape_2d ( ctx , cur , n_embd , n_tokens ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * xxx = ggml_add ( ctx , ggml_mul ( ctx , sx , layer - > time_mix_lerp_x ) , cur ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
xxx = ggml_reshape_4d (
ctx ,
ggml_tanh (
ctx ,
ggml_mul_mat ( ctx , layer - > time_mix_w1 , xxx )
) ,
layer - > time_mix_w1 - > ne [ 1 ] / 5 , 1 , 5 , n_tokens
) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
xxx = ggml_cont ( ctx , ggml_permute ( ctx , xxx , 0 , 1 , 3 , 2 ) ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
xxx = ggml_mul_mat (
ctx ,
ggml_reshape_4d (
ctx ,
layer - > time_mix_w2 ,
layer - > time_mix_w2 - > ne [ 0 ] , layer - > time_mix_w2 - > ne [ 1 ] , 1 , 5
) ,
xxx
) ;
2023-08-23 23:08:04 +03:00
2025-01-10 09:58:08 +08:00
struct ggml_tensor * xw , * xk , * xv , * xr , * xg ;
if ( layer - > time_mix_lerp_fused ) {
// fusing these weights makes some performance improvement
sx = ggml_reshape_3d ( ctx , sx , n_embd , 1 , n_tokens ) ;
cur = ggml_reshape_3d ( ctx , cur , n_embd , 1 , n_tokens ) ;
xxx = ggml_add ( ctx , ggml_mul ( ctx , ggml_add ( ctx , xxx , layer - > time_mix_lerp_fused ) , sx ) , cur ) ;
xw = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , 0 ) ;
xk = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * sizeof ( float ) ) ;
xv = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * 2 * sizeof ( float ) ) ;
xr = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * 3 * sizeof ( float ) ) ;
xg = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * 4 * sizeof ( float ) ) ;
} else {
// for backward compatibility
xw = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , 0 ) ;
xk = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * sizeof ( float ) ) ;
xv = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * 2 * sizeof ( float ) ) ;
xr = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * 3 * sizeof ( float ) ) ;
xg = ggml_view_2d ( ctx , xxx , n_embd , n_tokens , xxx - > nb [ 1 ] , n_embd * n_tokens * 4 * sizeof ( float ) ) ;
xw = ggml_add ( ctx , ggml_mul ( ctx , ggml_add ( ctx , xw , layer - > time_mix_lerp_w ) , sx ) , cur ) ;
xk = ggml_add ( ctx , ggml_mul ( ctx , ggml_add ( ctx , xk , layer - > time_mix_lerp_k ) , sx ) , cur ) ;
xv = ggml_add ( ctx , ggml_mul ( ctx , ggml_add ( ctx , xv , layer - > time_mix_lerp_v ) , sx ) , cur ) ;
xr = ggml_add ( ctx , ggml_mul ( ctx , ggml_add ( ctx , xr , layer - > time_mix_lerp_r ) , sx ) , cur ) ;
xg = ggml_add ( ctx , ggml_mul ( ctx , ggml_add ( ctx , xg , layer - > time_mix_lerp_g ) , sx ) , cur ) ;
}
2024-01-12 20:07:38 +01:00
2025-01-10 09:58:08 +08:00
struct ggml_tensor * r = llm_build_lora_mm ( lctx , ctx , layer - > time_mix_receptance , xr ) ;
struct ggml_tensor * k = llm_build_lora_mm ( lctx , ctx , layer - > time_mix_key , xk ) ;
struct ggml_tensor * v = llm_build_lora_mm ( lctx , ctx , layer - > time_mix_value , xv ) ;
if ( layer - > time_mix_receptance_b ) {
r = ggml_add ( ctx , r , layer - > time_mix_receptance_b ) ;
}
if ( layer - > time_mix_key_b ) {
k = ggml_add ( ctx , k , layer - > time_mix_key_b ) ;
}
if ( layer - > time_mix_value_b ) {
v = ggml_add ( ctx , v , layer - > time_mix_value_b ) ;
}
2024-10-30 02:01:23 +01:00
2025-01-10 09:58:08 +08:00
struct ggml_tensor * g = llm_build_lora_mm ( lctx , ctx , layer - > time_mix_gate , xg ) ;
if ( is_qrwkv ) {
g = ggml_sigmoid ( ctx , g ) ;
} else {
g = ggml_silu ( ctx , g ) ;
}
2024-01-12 20:07:38 +01:00
2025-01-10 09:58:08 +08:00
if ( head_count_kv ! = head_count ) {
GGML_ASSERT ( head_count % head_count_kv = = 0 ) ;
k = ggml_reshape_4d ( ctx , k , head_size , 1 , head_count_kv , n_tokens ) ;
v = ggml_reshape_4d ( ctx , v , head_size , 1 , head_count_kv , n_tokens ) ;
struct ggml_tensor * tmp = ggml_new_tensor_4d ( ctx , GGML_TYPE_F32 , head_size , head_count / head_count_kv , head_count_kv , n_tokens ) ;
k = ggml_repeat ( ctx , k , tmp ) ;
v = ggml_repeat ( ctx , v , tmp ) ;
}
2024-10-30 02:01:23 +01:00
2025-01-10 09:58:08 +08:00
k = ggml_reshape_3d ( ctx , k , head_size , head_count , n_tokens ) ;
v = ggml_reshape_3d ( ctx , v , head_size , head_count , n_tokens ) ;
r = ggml_reshape_3d ( ctx , r , head_size , head_count , n_tokens ) ;
2024-01-12 20:07:38 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * w = ggml_mul_mat (
ctx ,
layer - > time_mix_decay_w2 ,
ggml_tanh (
ctx ,
ggml_mul_mat ( ctx , layer - > time_mix_decay_w1 , xw )
)
) ;
2024-10-30 02:01:23 +01:00
2025-01-10 09:58:08 +08:00
w = ggml_add ( ctx , w , layer - > time_mix_decay ) ;
2025-01-03 10:18:53 +02:00
w = ggml_exp ( ctx , ggml_neg ( ctx , ggml_exp ( ctx , w ) ) ) ;
2025-01-10 09:58:08 +08:00
w = ggml_reshape_3d ( ctx , w , head_size , head_count , n_tokens ) ;
2024-04-03 15:07:05 +02:00
2025-01-10 09:58:08 +08:00
if ( is_qrwkv ) {
// k = k * (1 - w)
k = ggml_sub ( ctx , k , ggml_mul ( ctx , k , w ) ) ;
}
2024-04-03 15:07:05 +02:00
2025-01-10 09:58:08 +08:00
struct ggml_tensor * wkv_output ;
if ( ! layer - > time_mix_first ) {
wkv_output = ggml_gated_linear_attn ( ctx , k , v , r , w , * wkv_state , pow ( head_size , - 0.5f ) ) ;
} else {
wkv_output = ggml_rwkv_wkv6 ( ctx , k , v , r , layer - > time_mix_first , w , * wkv_state ) ;
}
2025-01-03 10:18:53 +02:00
cur = ggml_view_1d ( ctx , wkv_output , n_embd * n_tokens , 0 ) ;
* wkv_state = ggml_view_1d ( ctx , wkv_output , n_embd * head_size * n_seqs , n_embd * n_tokens * sizeof ( float ) ) ;
2024-04-03 15:07:05 +02:00
2025-01-10 09:58:08 +08:00
if ( ! is_qrwkv ) {
// group norm with head_count groups
cur = ggml_reshape_3d ( ctx , cur , n_embd / head_count , head_count , n_tokens ) ;
cur = ggml_norm ( ctx , cur , 64e-5 f ) ;
2024-10-30 02:01:23 +01:00
2025-01-10 09:58:08 +08:00
// Convert back to regular vectors.
cur = ggml_reshape_2d ( ctx , cur , n_embd , n_tokens ) ;
cur = ggml_add ( ctx , ggml_mul ( ctx , cur , layer - > time_mix_ln ) , layer - > time_mix_ln_b ) ;
} else {
cur = ggml_reshape_2d ( ctx , cur , n_embd , n_tokens ) ;
}
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
cur = ggml_mul ( ctx , cur , g ) ;
cur = llm_build_lora_mm ( lctx , ctx , layer - > time_mix_output , cur ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
return ggml_reshape_3d ( ctx , cur , n_embd , n_seq_tokens , n_seqs ) ;
}
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
static struct ggml_tensor * llm_build_rwkv6_channel_mix (
struct llama_context & lctx ,
struct ggml_context * ctx ,
const struct llama_layer * layer ,
struct ggml_tensor * cur ,
struct ggml_tensor * x_prev ) {
struct ggml_tensor * sx = ggml_sub ( ctx , x_prev , cur ) ;
struct ggml_tensor * xk = ggml_add ( ctx , ggml_mul ( ctx , sx , layer - > channel_mix_lerp_k ) , cur ) ;
struct ggml_tensor * xr = ggml_add ( ctx , ggml_mul ( ctx , sx , layer - > channel_mix_lerp_r ) , cur ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * r = ggml_sigmoid ( ctx , llm_build_lora_mm ( lctx , ctx , layer - > channel_mix_receptance , xr ) ) ;
struct ggml_tensor * k = ggml_sqr (
ctx ,
ggml_relu (
ctx ,
llm_build_lora_mm ( lctx , ctx , layer - > channel_mix_key , xk )
)
) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
return ggml_mul ( ctx , r , llm_build_lora_mm ( lctx , ctx , layer - > channel_mix_value , k ) ) ;
}
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
struct llm_build_context {
const llama_model & model ;
llama_context & lctx ;
const llama_hparams & hparams ;
const llama_cparams & cparams ;
const llama_ubatch & ubatch ;
const llama_kv_cache & kv_self ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd ;
const int64_t n_layer ;
const int64_t n_rot ;
const int64_t n_ctx ; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head ;
const int64_t n_head_kv ;
const int64_t n_embd_head_k ;
const int64_t n_embd_k_gqa ;
const int64_t n_embd_head_v ;
const int64_t n_embd_v_gqa ;
const int64_t n_expert ;
const int64_t n_expert_used ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
const float freq_base ;
const float freq_scale ;
const float ext_factor ;
const float attn_factor ;
const float beta_fast ;
const float beta_slow ;
const float norm_eps ;
const float norm_rms_eps ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
const int32_t n_tokens ;
const int32_t n_kv ; // size of KV cache to consider (n_kv <= kv_self.size)
const int32_t n_outputs ;
const int32_t n_outputs_enc ;
const int32_t kv_head ; // index of where we store new KV data in the cache
const int32_t n_ctx_orig ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
const bool flash_attn ;
2024-01-12 20:07:38 +01:00
2025-01-03 10:18:53 +02:00
const enum llama_pooling_type pooling_type ;
const enum llama_rope_type rope_type ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
const llm_build_cb & cb ;
2024-01-12 20:07:38 +01:00
2025-01-03 10:18:53 +02:00
std : : vector < uint8_t > & buf_compute_meta ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_context * ctx0 = nullptr ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
// TODO: consider making the entire interface noexcept
// Capture everything needed to build a graph for `ubatch`.
//
//   lctx       - llama context; source of the model, cparams, KV cache and compute buffer
//   ubatch     - micro-batch the graph is built for
//   cb         - callback invoked on tensors as they are created
//   worst_case - when true, size-dependent values are set to their upper bounds instead of the
//                current state: n_kv = kv_self.size, n_outputs = n_outputs_enc = n_tokens, and
//                kv_head = 0 for a recurrent cache or kv_self.size - n_tokens otherwise
//
// The initializers below rely on declaration order: hparams reads model, the dimension members
// read hparams/cparams, and n_outputs, n_outputs_enc and kv_head read n_tokens.
// No ggml context is created here; that happens in init().
llm_build_context(
    llama_context      & lctx,
    const llama_ubatch & ubatch,
    const llm_build_cb & cb,
    bool                 worst_case) :
    model            (lctx.model),
    lctx             (lctx),
    hparams          (model.hparams),
    cparams          (lctx.cparams),
    ubatch           (ubatch),
    kv_self          (lctx.kv_self),
    n_embd           (hparams.n_embd),
    n_layer          (hparams.n_layer),
    n_rot            (hparams.n_rot),
    n_ctx            (cparams.n_ctx),
    n_head           (hparams.n_head()),
    n_head_kv        (hparams.n_head_kv()),
    n_embd_head_k    (hparams.n_embd_head_k),
    n_embd_k_gqa     (hparams.n_embd_k_gqa()),
    n_embd_head_v    (hparams.n_embd_head_v),
    n_embd_v_gqa     (hparams.n_embd_v_gqa()),
    n_expert         (hparams.n_expert),
    n_expert_used    (hparams.n_expert_used),
    freq_base        (cparams.rope_freq_base),
    freq_scale       (cparams.rope_freq_scale),
    ext_factor       (cparams.yarn_ext_factor),
    attn_factor      (cparams.yarn_attn_factor),
    beta_fast        (cparams.yarn_beta_fast),
    beta_slow        (cparams.yarn_beta_slow),
    norm_eps         (hparams.f_norm_eps),
    norm_rms_eps     (hparams.f_norm_rms_eps),
    n_tokens         (ubatch.n_tokens),
    n_kv             (worst_case ? kv_self.size : kv_self.n),
    n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
    n_outputs_enc    (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
    kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
    n_ctx_orig       (cparams.n_ctx_orig_yarn),
    flash_attn       (cparams.flash_attn),
    pooling_type     (cparams.pooling_type),
    rope_type        (hparams.rope_type),
    cb               (cb),
    buf_compute_meta (lctx.buf_compute_meta) {
    // all initializations should be done in init()
}
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
void init ( ) {
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta . size ( ) ,
/*.mem_buffer =*/ buf_compute_meta . data ( ) ,
/*.no_alloc =*/ true ,
} ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2025-01-03 10:18:53 +02:00
ctx0 = ggml_init ( params ) ;
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
lctx . inp_tokens = nullptr ;
lctx . inp_embd = nullptr ;
lctx . inp_pos = nullptr ;
lctx . inp_out_ids = nullptr ;
lctx . inp_KQ_mask = nullptr ;
lctx . inp_KQ_mask_swa = nullptr ;
lctx . inp_K_shift = nullptr ;
lctx . inp_mean = nullptr ;
lctx . inp_cls = nullptr ;
lctx . inp_s_copy = nullptr ;
lctx . inp_s_mask = nullptr ;
lctx . inp_s_seq = nullptr ;
lctx . inp_pos_bucket = nullptr ;
lctx . inp_embd_enc = nullptr ;
lctx . inp_KQ_mask_cross = nullptr ;
}
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
// Release the ggml context held in ctx0 and reset the pointer to nullptr,
// so that a stale context is never used after this call.
void free ( ) {
ggml_free ( ctx0 ) ;
ctx0 = nullptr ;
}
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_k_shift ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-12-02 02:17:06 +08:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( kv_self . size = = n_ctx ) ;
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
lctx . inp_K_shift = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_ctx ) ;
cb ( lctx . inp_K_shift , " K_shift " , - 1 ) ;
ggml_set_input ( lctx . inp_K_shift ) ;
2024-07-27 05:03:45 -07:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
const int64_t n_head_kv = hparams . n_head_kv ( il ) ;
const int64_t n_embd_k_gqa = hparams . n_embd_k_gqa ( il ) ;
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
struct ggml_tensor * k =
ggml_view_3d ( ctx0 , kv_self . k_l [ il ] ,
n_embd_head_k , n_head_kv , n_ctx ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_head_k ) ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa ) ,
0 ) ;
2024-05-28 20:49:49 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * tmp ;
if ( ggml_is_quantized ( k - > type ) ) {
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast ( ctx0 , k , GGML_TYPE_F32 ) ;
cb ( tmp , " K_f32 " , il ) ;
for ( auto & backend : lctx . backends ) {
// Figure out which backend KV cache belongs to
if ( ggml_backend_supports_buft ( backend . get ( ) , ggml_backend_buffer_get_type ( kv_self . k_l [ il ] - > buffer ) ) ) {
ggml_backend_sched_set_tensor_backend ( lctx . sched . get ( ) , tmp , backend . get ( ) ) ;
break ;
2023-09-15 00:32:10 +08:00
}
2025-01-03 10:18:53 +02:00
}
tmp = ggml_rope_ext_inplace ( ctx0 , tmp ,
lctx . inp_K_shift , rope_factors , n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
cb ( tmp , " K_shifted_f32 " , il ) ;
tmp = ggml_cpy ( ctx0 , tmp , k ) ;
} else {
// we rotate only the first n_rot dimensions
tmp = ggml_rope_ext_inplace ( ctx0 , k ,
lctx . inp_K_shift , rope_factors , n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
}
cb ( tmp , " K_shifted " , il ) ;
ggml_build_forward_expand ( gf , tmp ) ;
}
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_defrag ( const std : : vector < uint32_t > & ids ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
for ( uint32_t i = 0 ; i < ids . size ( ) ; + + i ) {
const uint32_t id = ids [ i ] ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
if ( i = = id | | id = = ids . size ( ) ) {
continue ;
}
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
uint32_t nm = 1 ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
while ( i + nm < ids . size ( ) & & ids [ i + nm ] = = id + nm ) {
nm + + ;
}
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
const int64_t n_embd_k_gqa = hparams . n_embd_k_gqa ( il ) ;
const int64_t n_embd_v_gqa = hparams . n_embd_v_gqa ( il ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
ggml_tensor * view_k_src = ggml_view_2d ( ctx0 , kv_self . k_l [ il ] ,
n_embd_k_gqa , nm ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa ) ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa * i ) ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
ggml_tensor * view_k_dst = ggml_view_2d ( ctx0 , kv_self . k_l [ il ] ,
n_embd_k_gqa , nm ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa ) ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa * id ) ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
ggml_tensor * view_v_src ;
ggml_tensor * view_v_dst ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
if ( flash_attn ) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d ( ctx0 , kv_self . v_l [ il ] ,
n_embd_v_gqa , nm ,
ggml_row_size ( kv_self . v_l [ il ] - > type , n_embd_v_gqa ) ,
ggml_row_size ( kv_self . v_l [ il ] - > type , n_embd_v_gqa * i ) ) ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
view_v_dst = ggml_view_2d ( ctx0 , kv_self . v_l [ il ] ,
n_embd_v_gqa , nm ,
ggml_row_size ( kv_self . v_l [ il ] - > type , n_embd_v_gqa ) ,
ggml_row_size ( kv_self . v_l [ il ] - > type , n_embd_v_gqa * id ) ) ;
} else {
view_v_src = ggml_view_2d ( ctx0 , kv_self . v_l [ il ] ,
nm , n_embd_v_gqa ,
ggml_row_size ( kv_self . v_l [ il ] - > type , kv_self . size ) ,
ggml_row_size ( kv_self . v_l [ il ] - > type , i ) ) ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
view_v_dst = ggml_view_2d ( ctx0 , kv_self . v_l [ il ] ,
nm , n_embd_v_gqa ,
ggml_row_size ( kv_self . v_l [ il ] - > type , kv_self . size ) ,
ggml_row_size ( kv_self . v_l [ il ] - > type , id ) ) ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , view_k_src , view_k_dst ) ) ;
ggml_build_forward_expand ( gf , ggml_cpy ( ctx0 , view_v_src , view_v_dst ) ) ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
i + = nm - 1 ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
// Create the graph input tensor "inp_pos" (I32, n_tokens entries), store it in lctx.inp_pos
// and return it.
struct ggml_tensor * build_inp_pos() {
    lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    cb(lctx.inp_pos, "inp_pos", -1);
    ggml_set_input(lctx.inp_pos);
    return lctx.inp_pos;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * build_rope_factors ( int il ) {
// choose long/short freq factors based on the context size
const auto n_ctx_pre_seq = cparams . n_ctx / cparams . n_seq_max ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . rope_freqs ! = nullptr ) {
return model . layers [ il ] . rope_freqs ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
if ( n_ctx_pre_seq > hparams . n_ctx_orig_yarn ) {
return model . layers [ il ] . rope_long ;
}
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
return model . layers [ il ] . rope_short ;
}
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
// Creates the 1-D I32 graph input with n_outputs entries, records it in lctx.inp_out_ids,
// tags it "inp_out_ids" and marks it as a graph input.
struct ggml_tensor * build_inp_out_ids() {
    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
    lctx.inp_out_ids = out_ids;

    cb(out_ids, "inp_out_ids", -1);
    ggml_set_input(out_ids);

    return out_ids;
}
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * build_inp_KQ_mask ( bool causal = true ) {
lctx . inp_KQ_mask = causal
? ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_kv , GGML_PAD ( n_tokens , GGML_KQ_MASK_PAD ) )
: ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_tokens , GGML_PAD ( n_tokens , GGML_KQ_MASK_PAD ) ) ;
cb ( lctx . inp_KQ_mask , " KQ_mask " , - 1 ) ;
ggml_set_input ( lctx . inp_KQ_mask ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
return flash_attn ? ggml_cast ( ctx0 , lctx . inp_KQ_mask , GGML_TYPE_F16 ) : lctx . inp_KQ_mask ;
}
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * build_inp_KQ_mask_swa ( bool causal = true ) {
GGML_ASSERT ( hparams . n_swa > 0 ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
lctx . inp_KQ_mask_swa = causal
? ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_kv , GGML_PAD ( n_tokens , GGML_KQ_MASK_PAD ) )
: ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_tokens , GGML_PAD ( n_tokens , GGML_KQ_MASK_PAD ) ) ;
cb ( lctx . inp_KQ_mask_swa , " KQ_mask_swa " , - 1 ) ;
ggml_set_input ( lctx . inp_KQ_mask_swa ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
return flash_attn ? ggml_cast ( ctx0 , lctx . inp_KQ_mask_swa , GGML_TYPE_F16 ) : lctx . inp_KQ_mask_swa ;
}
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
// Creates the n_tokens x n_tokens F32 graph input used by mean pooling, records it in
// lctx.inp_mean, tags it "inp_mean" and marks it as a graph input.
struct ggml_tensor * build_inp_mean() {
    struct ggml_tensor * mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
    lctx.inp_mean = mean;

    cb(mean, "inp_mean", -1);
    ggml_set_input(mean);

    return mean;
}
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
// Creates the 1-D I32 graph input with n_tokens entries, records it in lctx.inp_cls,
// tags it "inp_cls" and marks it as a graph input.
struct ggml_tensor * build_inp_cls() {
    struct ggml_tensor * cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    lctx.inp_cls = cls;

    cb(cls, "inp_cls", -1);
    ggml_set_input(cls);

    return cls;
}
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
// Creates the 1-D I32 graph input with n_kv entries, records it in lctx.inp_s_copy,
// tags it "inp_s_copy" and marks it as a graph input.
struct ggml_tensor * build_inp_s_copy() {
    struct ggml_tensor * s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
    lctx.inp_s_copy = s_copy;

    cb(s_copy, "inp_s_copy", -1);
    ggml_set_input(s_copy);

    return s_copy;
}
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
// Creates the 1 x n_kv F32 graph input, records it in lctx.inp_s_mask,
// tags it "inp_s_mask" and marks it as a graph input.
struct ggml_tensor * build_inp_s_mask() {
    struct ggml_tensor * s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
    lctx.inp_s_mask = s_mask;

    cb(s_mask, "inp_s_mask", -1);
    ggml_set_input(s_mask);

    return s_mask;
}
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * append_pooling ( struct ggml_cgraph * gf ) {
// find result_norm tensor for input
struct ggml_tensor * inp = nullptr ;
for ( int i = ggml_graph_n_nodes ( gf ) - 1 ; i > = 0 ; - - i ) {
inp = ggml_graph_node ( gf , i ) ;
if ( strcmp ( inp - > name , " result_norm " ) = = 0 | | strcmp ( inp - > name , " result_embd " ) = = 0 ) {
break ;
} else {
inp = nullptr ;
}
}
GGML_ASSERT ( inp ! = nullptr & & " missing result_norm/result_embd tensor " ) ;
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
switch ( pooling_type ) {
case LLAMA_POOLING_TYPE_NONE :
{
cur = inp ;
2024-10-30 02:01:23 +01:00
} break ;
2025-01-03 10:18:53 +02:00
case LLAMA_POOLING_TYPE_MEAN :
2023-09-15 00:32:10 +08:00
{
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inp_mean = build_inp_mean ( ) ;
cur = ggml_mul_mat ( ctx0 , ggml_cont ( ctx0 , ggml_transpose ( ctx0 , inp ) ) , inp_mean ) ;
} break ;
case LLAMA_POOLING_TYPE_CLS :
case LLAMA_POOLING_TYPE_LAST :
{
struct ggml_tensor * inp_cls = build_inp_cls ( ) ;
cur = ggml_get_rows ( ctx0 , inp , inp_cls ) ;
} break ;
case LLAMA_POOLING_TYPE_RANK :
{
struct ggml_tensor * inp_cls = build_inp_cls ( ) ;
inp = ggml_get_rows ( ctx0 , inp , inp_cls ) ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
GGML_ASSERT ( model . cls ! = nullptr ) ;
GGML_ASSERT ( model . cls_b ! = nullptr ) ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . cls , inp ) , model . cls_b ) ;
cur = ggml_tanh ( ctx0 , cur ) ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2025-01-03 10:18:53 +02:00
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if ( model . cls_out ) {
GGML_ASSERT ( model . cls_out_b ! = nullptr ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , ggml_mul_mat ( ctx0 , model . cls_out , cur ) , model . cls_out_b ) ;
2023-08-23 23:08:04 +03:00
}
} break ;
2025-01-03 10:18:53 +02:00
default :
2023-08-23 23:08:04 +03:00
{
2025-01-03 10:18:53 +02:00
GGML_ABORT ( " unknown pooling type " ) ;
}
}
2024-03-18 12:49:02 -04:00
2025-01-03 10:18:53 +02:00
cb ( cur , " result_embd_pooled " , - 1 ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * llm_build_pos_bucket ( bool causal ) {
if ( causal ) {
lctx . inp_pos_bucket = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_I32 , n_kv , n_tokens ) ;
} else {
lctx . inp_pos_bucket = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_I32 , n_tokens , n_tokens ) ;
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
ggml_set_input ( lctx . inp_pos_bucket ) ;
cb ( lctx . inp_pos_bucket , " pos_bucket " , - 1 ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
return lctx . inp_pos_bucket ;
}
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * llm_build_pos_bias ( struct ggml_tensor * pos_bucket , struct ggml_tensor * attn_rel_b ) {
struct ggml_tensor * pos_bucket_1d = ggml_view_1d ( ctx0 , pos_bucket , pos_bucket - > ne [ 0 ] * pos_bucket - > ne [ 1 ] , 0 ) ;
cb ( pos_bucket_1d , " pos_bucket_1d " , - 1 ) ;
2024-05-18 10:04:55 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * pos_bias = ggml_get_rows ( ctx0 , attn_rel_b , pos_bucket_1d ) ;
cb ( pos_bias , " pos_bias " , - 1 ) ;
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
pos_bias = ggml_view_3d ( ctx0 , pos_bias , pos_bias - > ne [ 0 ] , lctx . inp_pos_bucket - > ne [ 0 ] , lctx . inp_pos_bucket - > ne [ 1 ] , ggml_element_size ( pos_bias ) * pos_bias - > ne [ 0 ] , ggml_element_size ( pos_bias ) * pos_bias - > ne [ 0 ] * lctx . inp_pos_bucket - > ne [ 0 ] , 0 ) ;
cb ( pos_bias , " pos_bias " , - 1 ) ;
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
pos_bias = ggml_permute ( ctx0 , pos_bias , 2 , 0 , 1 , 3 ) ;
cb ( pos_bias , " pos_bias " , - 1 ) ;
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
pos_bias = ggml_cont ( ctx0 , pos_bias ) ;
cb ( pos_bias , " pos_bias " , - 1 ) ;
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
return pos_bias ;
}
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * llm_build_inp_embd_enc ( ) {
const int64_t n_embd = hparams . n_embd ;
lctx . inp_embd_enc = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_embd , n_outputs_enc ) ;
ggml_set_input ( lctx . inp_embd_enc ) ;
cb ( lctx . inp_embd_enc , " embd_enc " , - 1 ) ;
return lctx . inp_embd_enc ;
}
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
// Creates the F32 graph input of shape n_outputs_enc x GGML_PAD(n_tokens, GGML_KQ_MASK_PAD),
// records it in lctx.inp_KQ_mask_cross, marks it as a graph input and tags it "KQ_mask_cross".
struct ggml_tensor * llm_build_inp_KQ_mask_cross() {
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
    lctx.inp_KQ_mask_cross = mask;

    ggml_set_input(mask);
    cb(mask, "KQ_mask_cross", -1);

    return mask;
}
2023-09-16 03:02:13 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_llama ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-09-28 17:42:03 +03:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-09-28 17:42:03 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-02-13 12:03:53 -05:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
const float kq_scale = hparams . f_attention_scale = = 0.0f ? 1.0f / sqrtf ( float ( n_embd_head ) ) : hparams . f_attention_scale ;
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2024-02-13 12:03:53 -05:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , rope_factors ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , rope_factors ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , kq_scale , cb , il ) ;
}
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
// For Granite architecture
if ( hparams . f_residual_scale ) {
cur = ggml_scale ( ctx0 , cur , hparams . f_residual_scale ) ;
}
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
if ( model . layers [ il ] . ffn_gate_inp = = nullptr ) {
2025-01-08 15:59:53 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
model . layers [ il ] . ffn_gate , model . layers [ il ] . ffn_gate_b , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
} else {
// MoE branch
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
nullptr ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
LLM_FFN_SILU , true ,
false , 0.0 ,
2025-01-04 21:06:11 +01:00
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( cur , " ffn_moe_out " , il ) ;
}
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
// For Granite architecture
if ( hparams . f_residual_scale ) {
cur = ggml_scale ( ctx0 , cur , hparams . f_residual_scale ) ;
}
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( cur , " ffn_out " , il ) ;
2024-06-06 09:22:41 +02:00
2025-01-03 10:18:53 +02:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-05-11 09:46:09 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
// For Granite architecture
if ( hparams . f_logit_scale ) {
cur = ggml_scale ( ctx0 , cur , 1.0f / hparams . f_logit_scale ) ;
}
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
cb ( cur , " result_output " , - 1 ) ;
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_deci ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-10-10 22:48:21 +08:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-01-12 20:07:38 +01:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-02-22 18:15:13 +10:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-10-10 09:50:23 +02:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-02-22 18:15:13 +10:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-10-10 09:50:23 +02:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-02-22 18:15:13 +10:00
2025-01-03 10:18:53 +02:00
const float kq_scale = hparams . f_attention_scale = = 0.0f ? 1.0f / sqrtf ( float ( n_embd_head ) ) : hparams . f_attention_scale ;
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
const int64_t n_head_kv = hparams . n_head_kv ( il ) ;
const int64_t n_head = hparams . n_head ( il ) ;
2024-02-22 18:15:13 +10:00
2025-01-03 10:18:53 +02:00
if ( n_head = = 0 ) {
// attention-free layer of Llama-3_1-Nemotron-51B
cur = inpL ;
} else {
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
}
2023-12-27 22:39:45 +07:00
2025-01-03 10:18:53 +02:00
if ( n_head > 0 & & n_head_kv = = 0 ) {
// "linear attention" of Llama-3_1-Nemotron-51B
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wo , cur ) ;
cb ( cur , " wo " , il ) ;
} else if ( n_head > 0 ) {
// self-attention
// rope freq factors for llama3; may return nullptr for llama2 and other models
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
2024-04-04 02:05:10 +08:00
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2024-04-04 02:05:10 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , rope_factors ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , rope_factors ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , kq_scale , cb , il ) ;
}
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-01-22 06:21:52 -05:00
2025-01-03 10:18:53 +02:00
// For Granite architecture
if ( hparams . f_residual_scale ) {
cur = ggml_scale ( ctx0 , cur , hparams . f_residual_scale ) ;
}
2024-04-16 08:48:35 -07:00
2025-01-03 10:18:53 +02:00
// modified to support attention-free layer of Llama-3_1-Nemotron-51B
struct ggml_tensor * ffn_inp = cur ;
if ( n_head > 0 ) {
ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
}
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
// feed-forward network
if ( model . layers [ il ] . ffn_gate_inp = = nullptr ) {
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
model . layers [ il ] . ffn_gate , model . layers [ il ] . ffn_gate_b , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
// For Granite architecture
if ( hparams . f_residual_scale ) {
cur = ggml_scale ( ctx0 , cur , hparams . f_residual_scale ) ;
}
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( cur , " ffn_out " , il ) ;
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
// For Granite architecture
if ( hparams . f_logit_scale ) {
cur = ggml_scale ( ctx0 , cur , 1.0f / hparams . f_logit_scale ) ;
}
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
cb ( cur , " result_output " , - 1 ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_baichuan ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
2025-01-12 11:32:42 +02:00
struct ggml_tensor * inp_pos = model . type = = LLM_TYPE_7B ? build_inp_pos ( ) : nullptr ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
// self-attention
{
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
switch ( model . type ) {
2025-01-12 11:32:42 +02:00
case LLM_TYPE_7B :
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
break ;
2025-01-12 11:32:42 +02:00
case LLM_TYPE_13B :
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd / n_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd / n_head , n_head , n_tokens ) ;
break ;
default :
GGML_ABORT ( " fatal error " ) ;
}
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// feed-forward network
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-01-13 13:44:37 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-01-13 13:44:37 +02:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-01-13 13:44:37 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_xverse ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-05-22 16:10:46 +02:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
// self-attention
{
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
// feed-forward network
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams , model . output_norm , NULL , LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_falcon ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-01-28 16:00:30 +08:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-01-28 16:00:30 +08:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-01-28 16:00:30 +08:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * attn_norm ;
2024-01-28 16:00:30 +08:00
2025-01-03 10:18:53 +02:00
attn_norm = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( attn_norm , " attn_norm " , il ) ;
2024-01-28 16:00:30 +08:00
2025-01-03 10:18:53 +02:00
// self-attention
{
if ( model . layers [ il ] . attn_norm_2 ) {
// Falcon-40B
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm_2 ,
model . layers [ il ] . attn_norm_2_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm_2 " , il ) ;
} else {
cur = attn_norm ;
}
2024-02-01 17:19:51 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-01-28 16:00:30 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2024-01-28 16:00:30 +08:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2024-02-21 05:08:22 -08:00
2025-01-03 10:18:53 +02:00
// using mode = 2 for neox mode
Qcur = ggml_rope_ext (
ctx0 , Qcur , inp_pos , nullptr , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-02-21 05:08:22 -08:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , Kcur , inp_pos , nullptr , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-02-21 05:08:22 -08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2024-02-21 05:08:22 -08:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
attn_norm = ggml_get_rows ( ctx0 , attn_norm , inp_out_ids ) ;
}
2024-02-21 05:08:22 -08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = cur ;
2024-06-28 00:00:43 -04:00
2025-01-03 10:18:53 +02:00
// feed forward
{
cur = llm_build_ffn ( ctx0 , lctx , attn_norm , // !! use the attn norm, not the result
model . layers [ il ] . ffn_up , NULL , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-06-28 00:00:43 -04:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = ggml_add ( ctx0 , cur , inpL ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-06-28 00:00:43 -04:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-06-28 00:00:43 -04:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-06-28 00:00:43 -04:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_grok ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// multiply by embedding_multiplier_scale of 78.38367176906169
inpL = ggml_scale ( ctx0 , inpL , 78.38367176906169f ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something other than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something other than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something other than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fall back to the internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f , cb , il ) ;
}
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something other than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fall back to the internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// Grok
// if attn_out_norm is present then apply it before adding the input
if ( model . layers [ il ] . attn_out_norm ) {
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . attn_out_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_out_norm " , il ) ;
}
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// feed-forward network
// MoE branch
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
cur = llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
nullptr ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
LLM_FFN_GELU , true ,
false , 0.0 ,
2025-01-04 21:06:11 +01:00
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( cur , " ffn_moe_out " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// Grok
// if layer_out_norm is present then apply it before adding the input
// Idea: maybe ffn_out_norm is a better name
if ( model . layers [ il ] . layer_out_norm ) {
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . layer_out_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " layer_out_norm " , il ) ;
}
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( cur , " ffn_out " , il ) ;
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-04-09 09:16:13 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-04-19 09:35:54 +00:00
2025-01-03 10:18:53 +02:00
// Grok
// multiply logits by output_multiplier_scale of 0.5773502691896257
2024-04-19 09:35:54 +00:00
2025-01-03 10:18:53 +02:00
cur = ggml_scale ( ctx0 , cur , 0.5773502691896257f ) ;
2024-04-19 09:35:54 +00:00
2025-01-03 10:18:53 +02:00
cb ( cur , " result_output " , - 1 ) ;
2024-04-19 09:35:54 +00:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-11-19 01:04:08 -08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-11-19 01:04:08 -08:00
2025-01-03 10:18:53 +02:00
// Builds and returns the ggml compute graph for the DBRX architecture.
//
// Pipeline, as visible in this function:
//   token embeddings
//   -> for each layer: norm -> fused QKV projection (clamped) -> RoPE on Q and K
//      -> KV-cache attention -> residual add -> norm -> MoE feed-forward
//      -> residual add -> lctx.cvec.apply_to
//   -> final norm -> lm_head projection.
//
// Returns the graph `gf` after expanding it with the final logits tensor.
// Reads members of the enclosing builder (ctx0, model, hparams, lctx, kv_self,
// ubatch, n_layer, n_embd, n_head, n_head_kv, n_outputs, RoPE parameters, cb).
struct ggml_cgraph * build_dbrx ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-11-19 01:04:08 -08:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-11-19 01:04:08 -08:00
2025-01-03 10:18:53 +02:00
// the graph below uses one head size for the V heads, the K heads and the rotary
// dimension, so both equalities are asserted up front
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-09-15 23:47:37 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-09-15 23:47:37 -07:00
2025-01-03 10:18:53 +02:00
// input embeddings for the current ubatch, looked up through model.tok_embd
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-09-15 23:47:37 -07:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-09-15 23:47:37 -07:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-09-15 23:47:37 -07:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
// keep the layer input: it is the residual added back after attention
struct ggml_tensor * inpSA = inpL ;
2024-09-15 23:47:37 -07:00
2025-01-03 10:18:53 +02:00
// norm
// LLM_NORM (not the RMS variant) with the attn_norm weight; the second tensor argument is NULL here
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// self-attention
{
struct ggml_tensor * Qcur = nullptr ;
struct ggml_tensor * Kcur = nullptr ;
struct ggml_tensor * Vcur = nullptr ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// single fused projection producing Q, K and V side by side in each row
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// clamp the fused QKV activations to [-f_clamp_kqv, +f_clamp_kqv]
cur = ggml_clamp ( ctx0 , cur , - hparams . f_clamp_kqv , hparams . f_clamp_kqv ) ;
cb ( cur , " wqkv_clamped " , il ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// split each row of the fused result into contiguous Q / K / V tensors:
//   Q: n_embd     values at byte offset 0
//   K: n_embd_gqa values at byte offset sizeof(float) * n_embd
//   V: n_embd_gqa values at byte offset sizeof(float) * (n_embd + n_embd_gqa)
// the byte offsets are computed with sizeof(float), i.e. they assume f32 elements in cur
Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// rotary position embedding on Q, reshaped to (n_embd_head, n_head, n_tokens)
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// same for K, reshaped with n_head_kv heads instead of n_head
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-05-23 11:49:53 +02:00
2025-01-03 10:18:53 +02:00
// attention through the KV cache with scale 1/sqrt(n_embd_head);
// wo is passed as the output projection and NULL is passed in place of bo (no output bias)
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2024-05-23 11:49:53 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
// both the attention output and the residual are reduced to the rows listed in
// inp_out_ids so the add below still lines up; n_tokens becomes n_outputs from here on
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-05-23 11:49:53 +02:00
2025-01-03 10:18:53 +02:00
// first residual connection: attention output + layer input
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-05-23 11:49:53 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
// MoE branch
// the pre-FFN norm uses the attn_out_norm weight (this function does not read ffn_norm)
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . attn_out_norm , NULL ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_out_norm " , il ) ;
2024-05-23 11:49:53 +02:00
2025-01-03 10:18:53 +02:00
// mixture-of-experts FFN: ffn_gate_inp plus the up/gate/down expert tensors,
// LLM_FFN_SILU activation and softmax expert gating.
// NOTE(review): the positional `nullptr`, `true` and `false , 0.0` arguments are opaque here;
// `false , 0.0` is presumably scale_w / w_scale (no expert-weight scaling) - confirm
// against the llm_build_moe_ffn signature.
cur = llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
nullptr ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
LLM_FFN_SILU , true ,
false , 0.0 ,
2025-01-04 21:06:11 +01:00
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( cur , " ffn_moe_out " , il ) ;
2024-05-23 11:49:53 +02:00
2025-01-03 10:18:53 +02:00
// second residual connection: MoE output + FFN input
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( cur , " ffn_out " , il ) ;
2024-05-23 11:49:53 +02:00
2025-01-03 10:18:53 +02:00
// pass the layer output through lctx.cvec for this layer index
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
// final normalization with model.output_norm (layer index -1 marks a non-layer tensor for cb)
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
cb ( cur , " result_output " , - 1 ) ;
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
// add the output tensor and everything it depends on to the graph
ggml_build_forward_expand ( gf , cur ) ;
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-05-24 14:31:13 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_starcoder ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * pos = ggml_get_rows ( ctx0 , model . pos_embd , inp_pos ) ;
cb ( pos , " pos_embd " , - 1 ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
inpL = ggml_add ( ctx0 , inpL , pos ) ;
cb ( inpL , " inpL " , - 1 ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
// self-attention
{
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-12-16 00:02:46 +07:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
}
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
// add the input
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
// FF
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
return gf ;
}
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
2025-01-03 10:18:53 +02:00
// Builds and returns the ggml compute graph produced by build_refact
// (presumably the Refact model architecture, judging by the name - confirm).
//
// Uses names that are not declared in this function and therefore come from
// the enclosing scope (not visible here): ctx0, lctx, model, hparams, ubatch,
// kv_self, cb, n_layer, n_head, n_head_kv, n_tokens, kv_head, n_kv.
//
// Per layer the graph is:
//   RMS norm -> Q/K/V projections -> llm_build_kv (output projection wo, no bias)
//   -> residual add -> RMS norm -> gated FFN (SILU, parallel gate) -> residual add
//   -> lctx.cvec.apply_to
// followed by a final RMS norm and the model.output projection.
// Note: no position input is built and no RoPE op is applied in this function.
//
// Returns: the graph `gf`, expanded up to the "result_output" tensor.
struct ggml_cgraph * build_refact ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-06-24 02:27:57 +08:00
2025-01-03 10:18:53 +02:00
// a single head size is used below, so the K and V head sizes must match
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-06-24 02:27:57 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-06-24 02:27:57 +08:00
2025-01-03 10:18:53 +02:00
// input embeddings built from model.tok_embd
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
// keep the layer input for the residual connection after attention
struct ggml_tensor * inpSA = inpL ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// pre-attention RMS norm (weight only, no bias tensor is passed)
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-07-05 10:15:24 +03:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// separate Q, K and V projections of the normed input; no biases are added
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// split K and Q into heads: (n_embd_head, n_head[_kv], n_tokens); Vcur is passed on unreshaped
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// attention via llm_build_kv with output projection wo and NULL bias;
// the scalar argument is 1/sqrt(n_embd_head)
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
// (both the attention result and the saved residual are reduced to the same rows)
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// residual connection around the attention block
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// up / gate / down weights without biases, SILU activation, parallel gate
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// residual connection around the FFN, then lctx.cvec.apply_to for this layer
// (presumably a control vector - confirm against lctx.cvec)
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-08-10 11:43:26 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-08-10 11:43:26 +02:00
2025-01-03 10:18:53 +02:00
// final RMS norm (layer index -1 is passed to the callback for non-layer tensors)
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-08-10 11:43:26 +02:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2024-08-10 11:43:26 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-08-10 11:43:26 +02:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-08-10 11:43:26 +02:00
2025-01-03 10:18:53 +02:00
// Builds and returns the ggml compute graph for the BERT-family builder.
//
// Three variants are distinguished by model.arch:
//   - LLM_ARCH_BERT:         separate Q/K/V projections with biases, learned position
//                            embeddings (model.pos_embd), GELU sequential FFN with biases
//   - LLM_ARCH_JINA_BERT_V2: separate Q/K/V projections with biases, no position input,
//                            GELU FFN with parallel gate
//   - any other arch:        fused wqkv projection, RoPE on Q and K, SILU FFN with parallel gate
//
// Attention is computed inline here (mul_mat / soft_max / mul_mat) rather than through
// llm_build_kv, and the graph ends at the "result_embd" tensor; no output projection is applied.
//
// Uses names from the enclosing scope that are not visible in this chunk: ctx0, lctx, model,
// hparams, ubatch, cb, n_layer, n_embd, n_head, n_head_kv, n_tokens, pooling_type and the
// RoPE parameters (n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
// attn_factor, beta_fast, beta_slow).
struct ggml_cgraph * build_bert ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
// a single head size is used below, so the K and V head sizes must match
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
// NOTE: this outer `cur` is shadowed by the per-layer `cur` declared inside the loop;
// it is only assigned after the loop.
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
struct ggml_tensor * inp_pos = nullptr ;
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
// positions are built for every arch except JINA_BERT_V2, for which inp_pos stays nullptr
// (inp_pos is only read in the BERT position-embedding add and in the RoPE branch)
if ( model . arch ! = LLM_ARCH_JINA_BERT_V2 ) {
inp_pos = build_inp_pos ( ) ;
}
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
// construct input embeddings (token, type, position)
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
// token types are hardcoded to zero ("Sentence A")
// type_row0 is a 1-D view of the first n_embd elements of model.type_embd (offset 0)
struct ggml_tensor * type_row0 = ggml_view_1d ( ctx0 , model . type_embd , n_embd , 0 ) ;
inpL = ggml_add ( ctx0 , inpL , type_row0 ) ;
// learned position embeddings are added for LLM_ARCH_BERT only
if ( model . arch = = LLM_ARCH_BERT ) {
inpL = ggml_add ( ctx0 , ggml_get_rows ( ctx0 , model . pos_embd , inp_pos ) , inpL ) ;
}
cb ( inpL , " inp_embd " , - 1 ) ;
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
// embed layer norm
inpL = llm_build_norm ( ctx0 , inpL , hparams , model . tok_norm , model . tok_norm_b , LLM_NORM , cb , - 1 ) ;
cb ( inpL , " inp_norm " , - 1 ) ;
2024-07-02 10:36:00 -04:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
// NOTE(review): the `false` argument presumably requests a non-causal mask - confirm against build_inp_KQ_mask
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( false ) ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
// iterate layers
for ( int il = 0 ; il < n_layer ; + + il ) {
// per-layer working tensor; intentionally shadows the function-level `cur`
struct ggml_tensor * cur = inpL ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur ;
struct ggml_tensor * Kcur ;
struct ggml_tensor * Vcur ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
// self-attention
if ( model . arch = = LLM_ARCH_BERT | | model . arch = = LLM_ARCH_JINA_BERT_V2 ) {
// separate projections; the bq / bk / bv biases are added unconditionally
Qcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
// optional Q norm, applied only when the layer provides attn_q_norm
if ( model . layers [ il ] . attn_q_norm ) {
Qcur = llm_build_norm ( ctx0 , Qcur , hparams ,
model . layers [ il ] . attn_q_norm ,
model . layers [ il ] . attn_q_norm_b ,
LLM_NORM , cb , il ) ;
}
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
// optional K norm, applied only when the layer provides attn_k_norm
if ( model . layers [ il ] . attn_k_norm ) {
Kcur = llm_build_norm ( ctx0 , Kcur , hparams ,
model . layers [ il ] . attn_k_norm ,
model . layers [ il ] . attn_k_norm_b ,
LLM_NORM , cb , il ) ;
}
Vcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
// split Q and K into heads; no RoPE is applied in this branch
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
} else {
// compute Q and K and RoPE them
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
// slice the fused result per row (row stride cur->nb[1]):
//   Q = first n_embd elements, K = next n_embd_gqa, V = following n_embd_gqa
// byte offsets are computed with sizeof(float), so float-sized elements are assumed
Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
// RoPE on Q and K after reshaping into heads; V is left untouched
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
// inline attention: swap the head and token dimensions of Q and K (K made contiguous)
struct ggml_tensor * q = ggml_permute ( ctx0 , Qcur , 0 , 2 , 1 , 3 ) ;
struct ggml_tensor * k = ggml_cont ( ctx0 , ggml_permute ( ctx0 , Kcur , 0 , 2 , 1 , 3 ) ) ;
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * kq = ggml_mul_mat ( ctx0 , k , q ) ;
cb ( kq , " kq " , il ) ;
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
// masked softmax with scale 1/sqrt(n_embd_head) and hparams.f_max_alibi_bias
kq = ggml_soft_max_ext ( ctx0 , kq , KQ_mask , 1.0f / sqrtf ( float ( n_embd_head ) ) , hparams . f_max_alibi_bias ) ;
cb ( kq , " kq_soft_max_ext " , il ) ;
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
// V is flattened to (n_embd_gqa, n_tokens), transposed and made contiguous
struct ggml_tensor * v = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , Vcur , n_embd_gqa , n_tokens ) ) ) ;
cb ( v , " v " , il ) ;
2024-08-15 19:23:33 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * kqv = ggml_mul_mat ( ctx0 , ggml_reshape_3d ( ctx0 , v , n_tokens , n_embd_head , n_head_kv ) , kq ) ;
cb ( kqv , " kqv " , il ) ;
2024-08-16 15:35:18 +09:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * kqv_merged = ggml_permute ( ctx0 , kqv , 0 , 2 , 1 , 3 ) ;
cb ( kqv_merged , " kqv_merged " , il ) ;
2024-08-16 15:35:18 +09:00
2025-01-03 10:18:53 +02:00
// merge heads back into a contiguous (n_embd_gqa, n_tokens) tensor
cur = ggml_cont_2d ( ctx0 , kqv_merged , n_embd_gqa , n_tokens ) ;
cb ( cur , " kqv_merged_cont " , il ) ;
2024-08-16 15:35:18 +09:00
2025-01-03 10:18:53 +02:00
// the attention result is added to the graph here, before the output projection
ggml_build_forward_expand ( gf , cur ) ;
2024-08-16 15:35:18 +09:00
2025-01-03 10:18:53 +02:00
// output projection; the "kqv_wo" callback is issued only when a bias follows,
// otherwise the projection result itself is reported as "kqv_out" below
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wo , cur ) ;
if ( model . layers [ il ] . bo ) {
cb ( cur , " kqv_wo " , il ) ;
}
2024-08-16 15:35:18 +09:00
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bo ) {
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bo ) ;
}
cb ( cur , " kqv_out " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// row selection happens on the last layer and only when pooling_type is LLAMA_POOLING_TYPE_NONE
if ( il = = n_layer - 1 & & pooling_type = = LLAMA_POOLING_TYPE_NONE ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// re-add the layer input
cur = ggml_add ( ctx0 , cur , inpL ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// attention layer norm
cur = llm_build_norm ( ctx0 , cur , hparams , model . layers [ il ] . attn_out_norm , model . layers [ il ] . attn_out_norm_b , LLM_NORM , cb , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// optional second residual + norm, used only when the layer provides attn_norm_2
if ( model . layers [ il ] . attn_norm_2 ! = nullptr ) {
cur = ggml_add ( ctx0 , cur , inpL ) ; // re-add the layer input
cur = llm_build_norm ( ctx0 , cur , hparams , model . layers [ il ] . attn_norm_2 , model . layers [ il ] . attn_norm_2_b , LLM_NORM , cb , il ) ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = cur ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// feed-forward network
//   BERT:         up/down with biases, GELU, sequential
//   JINA_BERT_V2: up/gate/down (bias on down only), GELU, parallel gate
//   otherwise:    up/gate/down without biases, SILU, parallel gate
if ( model . arch = = LLM_ARCH_BERT ) {
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
} else if ( model . arch = = LLM_ARCH_JINA_BERT_V2 ) {
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_PAR , cb , il ) ;
} else {
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
}
cb ( cur , " ffn_out " , il ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// attentions bypass the intermediate layer
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// output layer norm
cur = llm_build_norm ( ctx0 , cur , hparams , model . layers [ il ] . layer_out_norm , model . layers [ il ] . layer_out_norm_b , LLM_NORM , cb , il ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// this assigns the function-level `cur` (the loop-local one is out of scope here)
cur = inpL ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
// the last layer's output is reported as the embedding result; no output projection follows
cb ( cur , " result_embd " , - 1 ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-09-28 12:08:43 +00:00
2025-01-03 10:18:53 +02:00
// Builds and returns the ggml compute graph produced by build_bloom
// (presumably the BLOOM model architecture, judging by the name - confirm).
//
// Uses names that are not declared in this function and therefore come from
// the enclosing scope (not visible here): ctx0, lctx, model, hparams, ubatch,
// kv_self, cb, n_layer, n_embd, n_head, n_tokens, kv_head, n_kv.
//
// Graph outline:
//   token embeddings -> LayerNorm (tok_norm) -> per layer:
//     LayerNorm -> fused QKV projection + bias -> llm_build_kv (wo, bo)
//     -> residual add -> LayerNorm -> GELU sequential FFN with biases -> residual add
//     -> lctx.cvec.apply_to
//   -> final LayerNorm (output_norm) -> model.output projection.
// Note: no position input is built and no RoPE op is applied in this function.
//
// Returns: the graph `gf`, expanded up to the "result_output" tensor.
struct ggml_cgraph * build_bloom ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-09-28 12:08:43 +00:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
// a single head size is used below, so the K and V head sizes must match
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-09-28 12:08:43 +00:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-09-28 12:08:43 +00:00
2025-01-03 10:18:53 +02:00
// input embeddings built from model.tok_embd
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-09-28 12:08:43 +00:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-09-28 12:08:43 +00:00
2025-01-03 10:18:53 +02:00
// normalize the embeddings before the first layer (tok_norm weight and bias)
inpL = llm_build_norm ( ctx0 , inpL , hparams ,
model . tok_norm ,
model . tok_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( inpL , " inp_norm " , - 1 ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
// pre-attention norm; inpL itself is kept as the residual input
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// fused QKV projection; the bqkv bias is added unconditionally
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// slice the fused result per row (row stride cur->nb[1]):
//   Q = first n_embd elements, K = next n_embd_gqa, V = following n_embd_gqa
// byte offsets are computed with sizeof(float), so float-sized elements are assumed
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// only Q is reshaped into heads here; Kcur and Vcur are passed to llm_build_kv as 2-D tensors
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// attention via llm_build_kv with output projection wo and bias bo;
// the scalar argument is 1/sqrt(n_embd_head)
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
// (both the attention result and the residual input are reduced to the same rows)
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
}
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// Add the input
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// FF
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// up and down projections with biases, no gate tensor, GELU activation, sequential
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// residual connection around the FFN, then lctx.cvec.apply_to for this layer
// (presumably a control vector - confirm against lctx.cvec)
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// final norm with weight and bias (layer index -1 is passed to the callback for non-layer tensors)
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// projection through model.output produces the "result_output" tensor
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_mpt ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * pos ;
struct ggml_tensor * inpL ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-12-18 19:27:21 +02:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
if ( model . pos_embd ) {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
pos = ggml_get_rows ( ctx0 , model . pos_embd , inp_pos ) ;
cb ( pos , " pos_embd " , - 1 ) ;
inpL = ggml_add ( ctx0 , inpL , pos ) ;
cb ( inpL , " inpL " , - 1 ) ;
2024-10-30 02:01:23 +01:00
}
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * attn_norm ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
attn_norm = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( attn_norm , " attn_norm " , il ) ;
2023-12-21 21:07:46 +01:00
2025-01-03 10:18:53 +02:00
// self-attention
{
cur = attn_norm ;
2024-03-22 19:00:01 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2023-12-21 21:07:46 +01:00
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bqkv ) {
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
}
2024-03-22 19:00:01 +01:00
2025-01-03 10:18:53 +02:00
if ( hparams . f_clamp_kqv > 0.0f ) {
cur = ggml_clamp ( ctx0 , cur , - hparams . f_clamp_kqv , hparams . f_clamp_kqv ) ;
cb ( cur , " wqkv_clamped " , il ) ;
}
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2023-12-21 21:07:46 +01:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-10-07 18:27:51 +03:00
2025-01-03 10:18:53 +02:00
// Q/K Layernorm
if ( model . layers [ il ] . attn_q_norm ) {
Qcur = llm_build_norm ( ctx0 , Qcur , hparams ,
model . layers [ il ] . attn_q_norm ,
model . layers [ il ] . attn_q_norm_b ,
LLM_NORM , cb , il ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-03-22 19:00:01 +01:00
2025-01-03 10:18:53 +02:00
Kcur = llm_build_norm ( ctx0 , Kcur , hparams ,
model . layers [ il ] . attn_k_norm ,
model . layers [ il ] . attn_k_norm_b ,
LLM_NORM , cb , il ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-03-22 19:00:01 +01:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2024-03-22 19:00:01 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
} else {
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
2023-12-21 21:07:46 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
}
2023-06-06 22:41:53 +03:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
}
2023-07-05 08:58:05 +02:00
2025-01-03 10:18:53 +02:00
// Add the input
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-07-05 08:58:05 +02:00
2025-01-03 10:18:53 +02:00
// feed forward
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
model . layers [ il ] . ffn_act ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2023-05-13 15:38:36 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2023-12-21 21:07:46 +01:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
2024-01-12 20:07:38 +01:00
}
2023-05-13 15:38:36 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
cuda : loading models directly into VRAM, norm calculation on GPU, broadcasting for ggml_mul (#1483)
* Broadcasting for ggml_mul
* CUDA kernel for ggml_mul, norms in VRAM
* GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* fixup! GPU weights not in RAM, direct loading with cuFile
* define default model path once, sync path with readme (#1366)
* ~7% faster Q5_1 AVX2 code (#1477)
* convert.py: Support models which are stored in a single pytorch_model.bin (#1469)
* Support models in a single pytorch_model.bin
* Remove spurious line with typo
* benchmark-matmul: Print the average of the test results (#1490)
* Remove unused n_parts parameter (#1509)
* Fixes #1511 lambda issue for w64devkit (mingw) (#1513)
* Fix for w64devkit and mingw
* make kv_f16 the default for api users (#1517)
* minor : fix compile warnings
* readme : adds WizardLM to the list of supported models (#1485)
* main : make reverse prompt option act as a stop token in non-interactive mode (#1032)
* Make reverse prompt option act as a stop token in non-interactive scenarios
* Making requested review changes
* Update gpt_params_parse and fix a merge error
* Revert "Update gpt_params_parse and fix a merge error"
This reverts commit 2bb2ff1748513591ad45b175a75ed1d8089d84c8.
* Update gpt_params_parse and fix a merge error take 2
* examples : add persistent chat (#1495)
* examples : add persistent chat
* examples : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* tests : add missing header
* ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
* ggml : fix scalar implementation of Q4_1 dot
* llama : fix compile warnings in llama_set_state_data()
* llama : fix name shadowing and C4146 (#1526)
* Fix name shadowing and C4146
* Fix if macros not using defined when required
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Update llama-util.h
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* Code style
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Fix for mingw (#1462)
* llama : add llama_init_backend() API (close #1527)
* feature : add blis and other BLAS implementation support (#1502)
* feature: add blis support
* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927
* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake
* Fix typo in INTEGER
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "feature : add blis and other BLAS implementation support (#1502)"
This reverts commit 07e9ace0f9da424d82e75df969642522880feb92.
* GPU weights not in RAM, direct loading with cuFile
* llama : code style fixes + progress print fix
* ggml : ggml_mul better broadcast support
* cmake : workarounds for cufile when CMake version < 3.25
* gg rebase fixup
* Loop in llama.cpp, fixed progress callback
* Attempt clang-tidy fix
* llama : fix vram size computation
* Add forgotten fclose()
---------
Co-authored-by: András Salamon <ott2@users.noreply.github.com>
Co-authored-by: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Co-authored-by: Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Co-authored-by: rankaiyx <rankaiyx@rankaiyx.com>
Co-authored-by: Stephan Walter <stephan@walter.name>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: David Kennedy <dakennedyd@gmail.com>
Co-authored-by: Jason McCartney <jmac@theroot.org>
Co-authored-by: Evan Jones <evan.q.jones@gmail.com>
Co-authored-by: Maxime <672982+maximegmd@users.noreply.github.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Zenix <zenixls2@gmail.com>
2023-05-20 14:19:28 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-09-13 09:53:38 +03:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_stablelm ( ) {
struct ggml_cgraph * gf = ggml_new_graph ( ctx0 ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2023-09-28 21:42:38 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2024-09-13 09:53:38 +03:00
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = cur ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
cb ( Qcur , " Qcur " , il ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
cb ( Kcur , " Kcur " , il ) ;
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . attn_q_norm ) {
Qcur = llm_build_norm ( ctx0 , Qcur , hparams ,
model . layers [ il ] . attn_q_norm ,
NULL ,
LLM_NORM , cb , il ) ;
cb ( Qcur , " Qcur " , il ) ;
}
if ( model . layers [ il ] . attn_k_norm ) {
Kcur = llm_build_norm ( ctx0 , Kcur , hparams ,
model . layers [ il ] . attn_k_norm ,
NULL ,
LLM_NORM , cb , il ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , Qcur , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , Kcur , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-09-17 00:44:58 -06:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
// feed-forward network
{
if ( model . layers [ il ] . ffn_norm ) {
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
} else {
// parallel residual
cur = inpSA ;
}
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something other than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
return gf ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_qwen ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-07-15 20:50:47 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-07-15 20:50:47 +02:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// self-attention
{
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 2 * sizeof ( float ) * ( n_embd ) ) ) ;
2024-06-26 14:27:46 +08:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2024-06-26 14:27:46 +08:00
2025-01-03 10:18:53 +02:00
// using mode = 2 for neox mode
Qcur = ggml_rope_ext (
ctx0 , Qcur , inp_pos , nullptr , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-06-26 14:27:46 +08:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , Kcur , inp_pos , nullptr , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-07-07 20:52:10 +08:00
2025-01-03 10:18:53 +02:00
// feed-forward forward
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-06-26 14:27:46 +08:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-06-26 14:27:46 +08:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_qwen2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
// self-attention
2024-04-18 15:18:48 +02:00
{
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-04-18 15:18:48 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-07-15 20:50:47 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
return gf ;
}
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_qwen2vl ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
lctx . inp_pos = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , n_tokens * 4 ) ;
cb ( lctx . inp_pos , " inp_pos " , - 1 ) ;
ggml_set_input ( lctx . inp_pos ) ;
struct ggml_tensor * inp_pos = lctx . inp_pos ;
2024-06-29 20:44:08 -07:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
int sections [ 4 ] ;
std : : copy ( std : : begin ( hparams . rope_sections ) , std : : begin ( hparams . rope_sections ) + 4 , sections ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_multi (
ctx0 ,
ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , sections , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-01-20 16:05:49 +01:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_multi (
ctx0 ,
ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , sections , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-06-24 02:27:57 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-01-20 16:05:49 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2024-01-20 16:05:49 +01:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-01-20 16:05:49 +01:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-03-04 22:31:20 +02:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-01-20 16:05:49 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-01-20 16:05:49 +01:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_qwen2moe ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// self_attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// MoE branch
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
ggml_tensor * moe_out =
llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
nullptr ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
LLM_FFN_SILU , false ,
false , 0.0 ,
2025-01-04 21:06:11 +01:00
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( cur , " ffn_moe_out " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// FFN shared expert
{
ggml_tensor * cur_gate_inp = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . ffn_gate_inp_shexp , cur ) ;
cb ( cur_gate_inp , " ffn_shexp_gate_inp " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// sigmoid
ggml_tensor * cur_gate = ggml_div ( ctx0 , ggml_silu ( ctx0 , cur_gate_inp ) , cur_gate_inp ) ;
cb ( cur_gate , " ffn_shexp_gate " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
ggml_tensor * cur_ffn = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up_shexp , NULL , NULL ,
model . layers [ il ] . ffn_gate_shexp , NULL , NULL ,
model . layers [ il ] . ffn_down_shexp , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur_ffn , " ffn_shexp " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
ggml_tensor * ffn_shexp_out = ggml_mul ( ctx0 , cur_ffn , cur_gate ) ;
cb ( ffn_shexp_out , " ffn_shexp_out " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
moe_out = ggml_add ( ctx0 , moe_out , ffn_shexp_out ) ;
cb ( moe_out , " ffn_out " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
cur = moe_out ;
}
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
return gf ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_phi2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * attn_norm_output ;
struct ggml_tensor * ffn_output ;
struct ggml_tensor * inpL ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
attn_norm_output = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( attn_norm_output , " attn_norm " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// self-attention
{
struct ggml_tensor * Qcur = nullptr ;
struct ggml_tensor * Kcur = nullptr ;
struct ggml_tensor * Vcur = nullptr ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . wqkv ) {
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , attn_norm_output ) ;
cb ( cur , " wqkv " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
} else {
Qcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , attn_norm_output ) , model . layers [ il ] . bq ) ;
Kcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , attn_norm_output ) , model . layers [ il ] . bk ) ;
Vcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , attn_norm_output ) , model . layers [ il ] . bv ) ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , Qcur , inp_pos , nullptr , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// with phi2, we scale the Q to avoid precision issues
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
Qcur = ggml_scale ( ctx0 , Qcur , 1.0f / sqrtf ( float ( n_embd_head ) ) ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , Kcur , inp_pos , nullptr , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f , cb , il ) ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
attn_norm_output = ggml_get_rows ( ctx0 , attn_norm_output , inp_out_ids ) ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// FF
{
ffn_output = llm_build_ffn ( ctx0 , lctx , attn_norm_output ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
cb ( ffn_output , " ffn_out " , il ) ;
}
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_output ) ;
cur = ggml_add ( ctx0 , cur , inpL ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-09-01 22:38:17 +08:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
llama : add custom RoPE (#2054)
* Implement customizable RoPE
The original RoPE has pre-defined parameters
theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2]
Our customizable RoPE, ggml_rope_custom_inplace, uses
theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]
with the default matches the original
scale = 1.0
base = 10000
The new command line arguments
--rope-freq-base
--rope-freq-scale
set the two new RoPE parameter.
Recent researches show changing these two parameters extends the context limit with minimal loss.
1. Extending Context to 8K
kaiokendev
https://kaiokendev.github.io/til#extending-context-to-8k
2. Extending Context Window of Large Language Models via Positional Interpolation
Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian
https://arxiv.org/abs/2306.15595
3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation.
https://www.reddit.com/user/bloc97
https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
For the bold, try adding the following command line parameters to your favorite model:
-c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5
* ggml-metal: fix custom rope
* common: fix argument names in help
* llama: increase MEM_REQ_EVAL for MODEL_3B
It avoids crashing for quantized weights on CPU.
Better ways to calculate the required buffer size would be better.
* llama: make MEM_REQ_EVAL depend on n_ctx
* server: use proper Content-Type in curl examples
Without the header Content-Type: application/json, curl will POST with
Content-Type: application/x-www-form-urlencoded
Though our simple server doesn't care, the httplib.h used has a limit
with CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
With Content-Type: application/json, we can send large json data.
* style : minor fixes, mostly indentations
* ggml : fix asserts
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-15 06:34:16 -04:00
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output_no_bias " , - 1 ) ;
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . output_b ) ;
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_phi3 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-06-06 21:33:23 +02:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-06-14 19:47:19 +02:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = nullptr ;
if ( hparams . n_swa = = 0 ) {
// Phi-4 doesn't use sliding window attention
KQ_mask = build_inp_KQ_mask ( ) ;
} else {
KQ_mask = build_inp_KQ_mask_swa ( ) ;
2023-11-01 20:11:02 +02:00
}
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
auto residual = inpL ;
2023-06-14 19:47:19 +02:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// rope freq factors for 128k context
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * attn_norm_output = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
2025-01-09 11:21:41 +01:00
model . layers [ il ] . attn_norm_b ,
2025-01-03 10:18:53 +02:00
LLM_NORM_RMS , cb , il ) ;
cb ( attn_norm_output , " attn_norm " , il ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fall back to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = nullptr ;
struct ggml_tensor * Kcur = nullptr ;
struct ggml_tensor * Vcur = nullptr ;
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . wqkv ) {
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , attn_norm_output ) ;
cb ( cur , " wqkv " , il ) ;
2024-09-24 03:14:24 +03:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2025-01-09 11:21:41 +01:00
} else {
2025-01-03 10:18:53 +02:00
Qcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , attn_norm_output ) , model . layers [ il ] . bq ) ;
Kcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , attn_norm_output ) , model . layers [ il ] . bk ) ;
Vcur = ggml_add ( ctx0 , llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , attn_norm_output ) , model . layers [ il ] . bv ) ;
2024-09-24 03:14:24 +03:00
}
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , Qcur , inp_pos , rope_factors , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_scale ( ctx0 , Qcur , 1.0f / sqrtf ( float ( n_embd_head ) ) ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , Kcur , inp_pos , rope_factors , n_rot , rope_type , n_ctx_orig ,
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f , cb , il ) ;
}
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
residual = ggml_get_rows ( ctx0 , residual , inp_out_ids ) ;
}
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , residual ) ;
residual = cur ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-09 11:21:41 +01:00
model . layers [ il ] . ffn_norm , model . layers [ il ] . ffn_norm_b ,
2025-01-03 10:18:53 +02:00
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-02-25 22:12:24 +02:00
2025-01-09 11:21:41 +01:00
// feed-forward network
if ( model . layers [ il ] . ffn_gate_inp = = nullptr ) {
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SWIGLU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2025-01-09 11:21:41 +01:00
} else {
// MoE branch
cur = llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
nullptr ,
n_expert , n_expert_used ,
LLM_FFN_SILU , true ,
false , 0.0 ,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
cb , il ) ;
cb ( cur , " ffn_moe_out " , il ) ;
2025-01-03 10:18:53 +02:00
}
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , residual , cur ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
}
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
2025-01-09 11:21:41 +01:00
model . output_norm_b ,
2025-01-03 10:18:53 +02:00
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2025-01-09 11:21:41 +01:00
if ( model . output_b ! = nullptr ) {
cb ( cur , " result_output_no_bias " , - 1 ) ;
cur = ggml_add ( ctx0 , cur , model . output_b ) ;
}
2025-01-03 10:18:53 +02:00
cb ( cur , " result_output " , - 1 ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-02-27 14:35:51 +02:00
2024-02-25 22:12:24 +02:00
return gf ;
}
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_plamo ( ) {
struct ggml_cgraph * gf = ggml_new_graph ( ctx0 ) ;
2024-05-22 04:28:32 +08:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-07-27 05:03:45 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-05-22 04:28:32 +08:00
2025-01-03 10:18:53 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-05-22 04:28:32 +08:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-07-01 18:48:34 +02:00
2025-01-03 10:18:53 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-07-01 18:48:34 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * attention_norm = cur ;
2024-07-01 18:48:34 +02:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-07-01 18:48:34 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_rot , n_head , n_tokens ) , inp_pos , nullptr ,
n_embd_head , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_rot , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_embd_head , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-06-21 00:38:22 -05:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * sa_out = cur ;
2024-09-28 17:42:03 +03:00
2025-01-03 10:18:53 +02:00
cur = attention_norm ;
2024-09-28 17:42:03 +03:00
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
sa_out = ggml_get_rows ( ctx0 , sa_out , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
}
2024-09-28 17:42:03 +03:00
2025-01-03 10:18:53 +02:00
// feed-forward network
{
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-06-21 00:38:22 -05:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , sa_out ) ;
cur = ggml_add ( ctx0 , cur , inpL ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-06-21 00:38:22 -05:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
2024-07-04 15:46:11 +02:00
}
2025-01-03 10:18:53 +02:00
cur = inpL ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
return gf ;
2024-07-04 15:46:11 +02:00
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_gpt2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-04-05 22:07:33 +03:00
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
2025-01-03 10:18:53 +02:00
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
2024-01-02 03:51:28 -08:00
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2023-06-14 19:47:19 +02:00
2023-11-01 20:11:02 +02:00
struct ggml_tensor * cur ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * pos ;
2023-11-01 20:11:02 +02:00
struct ggml_tensor * inpL ;
2023-08-23 23:08:04 +03:00
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-08-23 23:08:04 +03:00
2023-11-01 20:11:02 +02:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-08-23 23:08:04 +03:00
2023-11-01 20:11:02 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
pos = ggml_get_rows ( ctx0 , model . pos_embd , inp_pos ) ;
cb ( pos , " pos_embd " , - 1 ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
inpL = ggml_add ( ctx0 , inpL , pos ) ;
cb ( inpL , " inpL " , - 1 ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " attn_norm " , il ) ;
2023-08-23 23:08:04 +03:00
2023-11-01 20:11:02 +02:00
// self-attention
{
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2023-08-23 23:08:04 +03:00
2023-11-01 20:11:02 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
cb ( Vcur , " Vcur " , il ) ;
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
2023-08-23 23:08:04 +03:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2023-12-02 02:17:06 +08:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2025-01-03 10:18:53 +02:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2023-11-01 20:11:02 +02:00
}
2023-09-15 00:32:10 +08:00
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
2024-09-17 00:44:58 -06:00
}
2025-01-03 10:18:53 +02:00
// add the input
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
2023-11-01 20:11:02 +02:00
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
// FF
{
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " ffn_norm " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
2025-01-03 10:18:53 +02:00
NULL , NULL , NULL ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
2023-12-27 22:39:45 +07:00
NULL ,
2025-01-03 10:18:53 +02:00
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " ffn_out " , il ) ;
2024-09-17 00:44:58 -06:00
}
2023-11-01 20:11:02 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " l_out " , il ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
// input for next layer
inpL = cur ;
}
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_norm " , - 1 ) ;
2023-09-28 19:04:36 +03:00
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_output " , - 1 ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
return gf ;
}
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_codeshell ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-12-23 08:22:33 +08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
2025-01-03 10:18:53 +02:00
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
2024-12-23 08:22:33 +08:00
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
// self-attention
{
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * tmpq = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * tmpk = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
cb ( tmpq , " tmpq " , il ) ;
cb ( tmpk , " tmpk " , il ) ;
2024-12-23 08:22:33 +08:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , tmpq , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
2024-12-23 08:22:33 +08:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , tmpk , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
2024-12-23 08:22:33 +08:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2025-01-03 10:18:53 +02:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-12-23 08:22:33 +08:00
}
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
2024-12-23 08:22:33 +08:00
}
2025-01-03 10:18:53 +02:00
// add the input
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-12-23 08:22:33 +08:00
2025-01-03 10:18:53 +02:00
// FF
{
2024-12-23 08:22:33 +08:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
2024-12-23 08:22:33 +08:00
cb ( cur , " ffn_norm " , il ) ;
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
2025-01-03 10:18:53 +02:00
NULL , NULL , NULL ,
2024-12-23 08:22:33 +08:00
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
2025-01-03 10:18:53 +02:00
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
2024-12-23 08:22:33 +08:00
cb ( cur , " ffn_out " , il ) ;
}
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
2024-12-23 08:22:33 +08:00
cb ( cur , " result_norm " , - 1 ) ;
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_orion ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-09-15 00:32:10 +08:00
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2024-01-12 13:01:56 +02:00
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-01-02 03:51:28 -08:00
2023-11-01 20:11:02 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-09-15 00:32:10 +08:00
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
// inp_pos - contains the positions
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-09-28 19:04:36 +03:00
2023-11-01 20:11:02 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2023-09-15 00:32:10 +08:00
2025-01-03 10:18:53 +02:00
// norm
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm , model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " attn_norm " , il ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
// self-attention
{
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
// if (model.layers[il].bq) {
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
// cb(Qcur, "Qcur", il);
// }
2023-11-01 20:11:02 +02:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
// if (model.layers[il].bk) {
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
// cb(Kcur, "Kcur", il);
// }
2023-11-01 20:11:02 +02:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
// if (model.layers[il].bv) {
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
// cb(Vcur, "Vcur", il);
// }
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2023-11-01 20:11:02 +02:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2023-11-01 20:11:02 +02:00
cb ( Kcur , " Kcur " , il ) ;
2023-09-15 00:32:10 +08:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2023-11-01 20:11:02 +02:00
model . layers [ il ] . wo , NULL ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2023-09-15 00:32:10 +08:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2023-11-01 20:11:02 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
// feed-forward network
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " l_out " , il ) ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
// input for next layer
inpL = cur ;
2023-09-15 00:32:10 +08:00
}
2023-11-01 20:11:02 +02:00
cur = inpL ;
2023-09-15 00:32:10 +08:00
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm , model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_norm " , - 1 ) ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_output " , - 1 ) ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
return gf ;
}
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_internlm2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-03-29 21:37:03 +08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-03-29 21:37:03 +08:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
2025-01-03 10:18:53 +02:00
// norm
2024-03-29 21:37:03 +08:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2024-03-29 21:37:03 +08:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2024-03-29 21:37:03 +08:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2024-03-29 21:37:03 +08:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-03-29 21:37:03 +08:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2024-03-29 21:37:03 +08:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2024-03-29 21:37:03 +08:00
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-03-29 21:37:03 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-03-29 21:37:03 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-03-29 21:37:03 +08:00
}
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
2024-03-29 21:37:03 +08:00
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
// feed-forward network
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-03-29 21:37:03 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2024-03-29 21:37:03 +08:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2024-03-29 21:37:03 +08:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2024-03-29 21:37:03 +08:00
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-03-29 21:37:03 +08:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_minicpm3 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256 ;
const float scale_embd = 12.0f ;
const float scale_depth = 1.4f ;
const float kq_scale = 1.0f / sqrtf ( float ( hparams . n_embd_head_k ) ) ;
const uint32_t n_embd_head_qk_rope = hparams . n_rot ;
const uint32_t n_embd_head_qk_nope = hparams . n_embd_head_k - hparams . n_rot ;
const uint32_t kv_lora_rank = hparams . n_lora_kv ;
2024-01-02 03:51:28 -08:00
2023-11-01 20:11:02 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-10-04 06:23:39 -07:00
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// scale the input embeddings
inpL = ggml_scale ( ctx0 , inpL , scale_embd ) ;
cb ( inpL , " inp_scaled " , - 1 ) ;
2023-11-01 20:11:02 +02:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
// self_attention
2023-11-01 20:11:02 +02:00
{
2025-01-03 10:18:53 +02:00
struct ggml_tensor * q = NULL ;
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat ( ctx0 , model . layers [ il ] . wq_a , cur ) ;
cb ( q , " q " , il ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
q = llm_build_norm ( ctx0 , q , hparams ,
model . layers [ il ] . attn_q_a_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( q , " q " , il ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat ( ctx0 , model . layers [ il ] . wq_b , q ) ;
cb ( q , " q " , il ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
// split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * q_nope = ggml_view_3d ( ctx0 , q , n_embd_head_qk_nope , n_head , n_tokens ,
ggml_row_size ( q - > type , hparams . n_embd_head_k ) ,
ggml_row_size ( q - > type , hparams . n_embd_head_k * n_head ) ,
0 ) ;
cb ( q_nope , " q_nope " , il ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
// and {n_head * n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * q_pe = ggml_view_3d ( ctx0 , q , n_embd_head_qk_rope , n_head , n_tokens ,
ggml_row_size ( q - > type , hparams . n_embd_head_k ) ,
ggml_row_size ( q - > type , hparams . n_embd_head_k * n_head ) ,
ggml_row_size ( q - > type , n_embd_head_qk_nope ) ) ;
cb ( q_pe , " q_pe " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat ( ctx0 , model . layers [ il ] . wkv_a_mqa , cur ) ;
cb ( kv_pe_compresseed , " kv_pe_compresseed " , il ) ;
// split into {kv_lora_rank, n_tokens}
struct ggml_tensor * kv_compressed = ggml_view_2d ( ctx0 , kv_pe_compresseed , kv_lora_rank , n_tokens ,
kv_pe_compresseed - > nb [ 1 ] ,
0 ) ;
cb ( kv_compressed , " kv_compressed " , il ) ;
// and {n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * k_pe = ggml_view_3d ( ctx0 , kv_pe_compresseed , n_embd_head_qk_rope , 1 , n_tokens ,
kv_pe_compresseed - > nb [ 1 ] ,
kv_pe_compresseed - > nb [ 1 ] ,
ggml_row_size ( kv_pe_compresseed - > type , kv_lora_rank ) ) ;
cb ( k_pe , " k_pe " , il ) ;
kv_compressed = ggml_cont ( ctx0 , kv_compressed ) ; // TODO: the CUDA backend does not support non-contiguous norm
kv_compressed = llm_build_norm ( ctx0 , kv_compressed , hparams ,
model . layers [ il ] . attn_kv_a_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( kv_compressed , " kv_compressed " , il ) ;
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
struct ggml_tensor * kv = ggml_mul_mat ( ctx0 , model . layers [ il ] . wkv_b , kv_compressed ) ;
cb ( kv , " kv " , il ) ;
// split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * k_nope = ggml_view_3d ( ctx0 , kv , n_embd_head_qk_nope , n_head , n_tokens ,
ggml_row_size ( kv - > type , n_embd_head_qk_nope + hparams . n_embd_head_v ) ,
ggml_row_size ( kv - > type , n_head * ( n_embd_head_qk_nope + hparams . n_embd_head_v ) ) ,
0 ) ;
cb ( k_nope , " k_nope " , il ) ;
// and {n_head * n_embd_head_v, n_tokens}
struct ggml_tensor * v_states = ggml_view_3d ( ctx0 , kv , hparams . n_embd_head_v , n_head , n_tokens ,
ggml_row_size ( kv - > type , ( n_embd_head_qk_nope + hparams . n_embd_head_v ) ) ,
ggml_row_size ( kv - > type , ( n_embd_head_qk_nope + hparams . n_embd_head_v ) * n_head ) ,
ggml_row_size ( kv - > type , ( n_embd_head_qk_nope ) ) ) ;
cb ( v_states , " v_states " , il ) ;
v_states = ggml_cont ( ctx0 , v_states ) ;
cb ( v_states , " v_states " , il ) ;
v_states = ggml_view_2d ( ctx0 , v_states , hparams . n_embd_head_v * n_head , n_tokens ,
ggml_row_size ( kv - > type , hparams . n_embd_head_v * n_head ) ,
0 ) ;
cb ( v_states , " v_states " , il ) ;
2025-01-15 12:51:37 +01:00
q_pe = ggml_cont ( ctx0 , q_pe ) ; // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
2025-01-03 10:18:53 +02:00
q_pe = ggml_rope_ext (
ctx0 , q_pe , inp_pos , rope_factors ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
2023-11-01 18:04:33 -04:00
) ;
2025-01-03 10:18:53 +02:00
cb ( q_pe , " q_pe " , il ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
// shared RoPE key
2025-01-15 12:51:37 +01:00
k_pe = ggml_cont ( ctx0 , k_pe ) ; // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
2025-01-03 10:18:53 +02:00
k_pe = ggml_rope_ext (
ctx0 , k_pe , inp_pos , rope_factors ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
2023-11-01 18:04:33 -04:00
) ;
2025-01-03 10:18:53 +02:00
cb ( k_pe , " k_pe " , il ) ;
struct ggml_tensor * q_states = ggml_concat ( ctx0 , q_nope , q_pe , 0 ) ;
cb ( q_states , " q_states " , il ) ;
struct ggml_tensor * k_states = ggml_concat ( ctx0 , k_nope , ggml_repeat ( ctx0 , k_pe , q_pe ) , 0 ) ;
cb ( k_states , " k_states " , il ) ;
2023-10-04 06:23:39 -07:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2023-11-01 20:11:02 +02:00
model . layers [ il ] . wo , NULL ,
2025-01-03 10:18:53 +02:00
k_states , v_states , q_states , KQ_mask , n_tokens , kv_head , n_kv , kq_scale , cb , il ) ;
2023-11-01 08:04:02 +02:00
}
2023-10-04 06:23:39 -07:00
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
// scale_res - scale the hidden states for residual connection
const float scale_res = scale_depth / sqrtf ( float ( n_layer ) ) ;
cur = ggml_scale ( ctx0 , cur , scale_res ) ;
cb ( cur , " hidden_scaled " , il ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
// feed-forward network
2023-11-01 20:11:02 +02:00
{
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_up , NULL , NULL ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_gate , NULL , NULL ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_down , NULL , NULL ,
2023-12-27 22:39:45 +07:00
NULL ,
2025-01-03 10:18:53 +02:00
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " ffn_out " , il ) ;
}
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
// scale the hidden states for residual connection
cur = ggml_scale ( ctx0 , cur , scale_res ) ;
cb ( cur , " hidden_scaled_ffn " , il ) ;
2023-11-01 20:11:02 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " l_out " , il ) ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
// input for next layer
inpL = cur ;
2023-10-04 06:23:39 -07:00
}
2023-11-01 20:11:02 +02:00
cur = inpL ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_norm " , - 1 ) ;
2023-10-04 06:23:39 -07:00
2025-01-03 10:18:53 +02:00
// lm_head scaling
const float scale_lmhead = float ( n_embd_base ) / float ( n_embd ) ;
cur = ggml_scale ( ctx0 , cur , scale_lmhead ) ;
cb ( cur , " lmhead_scaling " , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_output " , - 1 ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-10-04 06:23:39 -07:00
2023-11-01 20:11:02 +02:00
return gf ;
2023-10-04 06:23:39 -07:00
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_gemma ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head_k = hparams . n_embd_head_k ;
2024-03-23 17:41:53 +01:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
inpL = ggml_scale ( ctx0 , inpL , sqrtf ( n_embd ) ) ;
cb ( inpL , " inp_scaled " , - 1 ) ;
2024-03-23 17:41:53 +01:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
// compute Q and K and RoPE them
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2024-03-23 17:41:53 +01:00
cb ( Qcur , " Qcur " , il ) ;
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2024-03-23 17:41:53 +01:00
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2024-03-23 17:41:53 +01:00
cb ( Vcur , " Vcur " , il ) ;
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head_k , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
2024-03-23 17:41:53 +01:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
Qcur = ggml_scale ( ctx0 , Qcur , 1.0f / sqrtf ( float ( n_embd_head_k ) ) ) ;
cb ( Qcur , " Qcur_scaled " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head_k , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
2024-03-23 17:41:53 +01:00
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . wo , NULL ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f , cb , il ) ;
2024-03-23 17:41:53 +01:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
2024-03-23 17:41:53 +01:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * sa_out = ggml_add ( ctx0 , cur , inpL ) ;
cb ( sa_out , " sa_out " , il ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , sa_out , hparams ,
2024-03-23 17:41:53 +01:00
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2025-01-03 10:18:53 +02:00
// feed-forward network
{
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , sa_out ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2024-04-13 11:33:52 +02:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-04-13 11:33:52 +02:00
cb ( cur , " result_output " , - 1 ) ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
return gf ;
}
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_gemma2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head_k = hparams . n_embd_head_k ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-03-23 17:41:53 +01:00
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
inpL = ggml_scale ( ctx0 , inpL , sqrtf ( n_embd ) ) ;
cb ( inpL , " inp_scaled " , - 1 ) ;
2024-04-13 11:33:52 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2025-01-03 10:18:53 +02:00
// gemma 2 requires different mask for layers using sliding window (SWA)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( true ) ;
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa ( true ) ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
// (il % 2) layers use SWA
struct ggml_tensor * KQ_mask_l = ( il % 2 = = 0 ) ? KQ_mask_swa : KQ_mask ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
2024-04-13 11:33:52 +02:00
cb ( cur , " attn_norm " , il ) ;
2024-03-23 17:41:53 +01:00
2024-04-13 11:33:52 +02:00
// self-attention
{
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2024-04-13 11:33:52 +02:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2024-04-13 11:33:52 +02:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2024-04-13 11:33:52 +02:00
cb ( Vcur , " Vcur " , il ) ;
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head_k , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
2024-04-13 11:33:52 +02:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
switch ( model . type ) {
2025-01-12 11:32:42 +02:00
case LLM_TYPE_2B :
case LLM_TYPE_9B : Qcur = ggml_scale ( ctx0 , Qcur , 1.0f / sqrtf ( float ( n_embd_head_k ) ) ) ; break ;
case LLM_TYPE_27B : Qcur = ggml_scale ( ctx0 , Qcur , 1.0f / sqrtf ( float ( n_embd / n_head ) ) ) ; break ;
2025-01-03 10:18:53 +02:00
default : GGML_ABORT ( " fatal error " ) ;
} ;
cb ( Qcur , " Qcur_scaled " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head_k , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow ) ;
2024-04-13 11:33:52 +02:00
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
model . layers [ il ] . wo , NULL ,
2025-01-03 10:18:53 +02:00
Kcur , Vcur , Qcur , KQ_mask_l , n_tokens , kv_head , n_kv , 1.0f , cb , il ) ;
2024-03-23 17:41:53 +01:00
}
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . attn_post_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_post_norm " , il ) ;
2024-04-13 11:33:52 +02:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
2024-04-13 11:33:52 +02:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * sa_out = ggml_add ( ctx0 , cur , inpL ) ;
cb ( sa_out , " sa_out " , il ) ;
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , sa_out , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-04-13 11:33:52 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
{
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . ffn_post_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " ffn_post_norm " , - 1 ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , sa_out ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2024-03-23 17:41:53 +01:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2024-03-23 17:41:53 +01:00
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-03-23 17:41:53 +01:00
2025-01-03 10:18:53 +02:00
// final logit soft-capping
cur = ggml_scale ( ctx0 , cur , 1.0f / hparams . f_final_logit_softcapping ) ;
cur = ggml_tanh ( ctx0 , cur ) ;
cur = ggml_scale ( ctx0 , cur , hparams . f_final_logit_softcapping ) ;
2024-03-23 17:41:53 +01:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_starcoder2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-08-23 23:08:04 +03:00
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-01-02 03:51:28 -08:00
2023-11-01 20:11:02 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-08-23 23:08:04 +03:00
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-09-28 19:04:36 +03:00
2023-11-01 20:11:02 +02:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-08-23 23:08:04 +03:00
2023-11-01 20:11:02 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-08-23 23:08:04 +03:00
2023-11-01 20:11:02 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
// norm
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm , model . layers [ il ] . attn_norm_b ,
2023-11-01 20:11:02 +02:00
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-08-23 23:08:04 +03:00
2023-11-01 20:11:02 +02:00
// self-attention
{
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2023-11-01 20:11:02 +02:00
cb ( Qcur , " Qcur " , il ) ;
2023-08-23 23:08:04 +03:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2023-08-23 23:08:04 +03:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2023-11-01 20:11:02 +02:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2023-11-01 20:11:02 +02:00
}
2023-03-22 07:32:36 +02:00
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
2023-11-01 20:11:02 +02:00
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
2023-12-27 22:39:45 +07:00
NULL ,
2023-11-01 20:11:02 +02:00
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_out " , il ) ;
2023-03-22 07:32:36 +02:00
2024-06-25 21:47:40 +01:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
2023-03-22 07:32:36 +02:00
}
2025-01-03 10:18:53 +02:00
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , model . output_norm_b ,
2023-11-01 20:11:02 +02:00
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2023-03-22 07:32:36 +02:00
2025-01-03 10:18:53 +02:00
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
2023-03-22 07:32:36 +02:00
2023-11-01 20:11:02 +02:00
return gf ;
2023-03-22 07:32:36 +02:00
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_mamba ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-10-28 12:06:08 +03:00
2023-11-01 20:11:02 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-10-28 12:06:08 +03:00
2025-01-03 10:18:53 +02:00
// {n_embd, n_tokens}
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * state_copy = build_inp_s_copy ( ) ;
struct ggml_tensor * state_mask = build_inp_s_mask ( ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
// norm
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_mamba ( ctx0 , lctx , ubatch , gf , cur ,
state_copy , state_mask ,
kv_head , n_kv , cb , il ) ;
2023-11-01 08:04:02 +02:00
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
2023-11-01 20:11:02 +02:00
}
2023-10-28 12:06:08 +03:00
2025-01-03 10:18:53 +02:00
// residual
cur = ggml_add ( ctx0 , cur , inpL ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " l_out " , il ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
// input for next layer
inpL = cur ;
2023-09-16 03:02:13 +08:00
}
2025-01-03 10:18:53 +02:00
// final rmsnorm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2023-11-01 20:11:02 +02:00
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2023-09-16 03:02:13 +08:00
2023-11-01 20:11:02 +02:00
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_output " , - 1 ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
return gf ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_command_r ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-02-11 10:21:38 -06:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2025-01-03 10:18:53 +02:00
const float f_logit_scale = hparams . f_logit_scale ;
2024-02-11 10:21:38 -06:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-02-13 06:06:58 -06:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-02-11 10:21:38 -06:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2025-01-03 10:18:53 +02:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-02-11 10:21:38 -06:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
struct ggml_tensor * ffn_inp = cur ;
2024-03-04 22:31:20 +02:00
2024-02-11 10:21:38 -06:00
// self-attention
2025-01-03 10:18:53 +02:00
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2024-02-11 10:21:38 -06:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-05-11 09:46:09 +02:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2024-02-11 10:21:38 -06:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-05-11 09:46:09 +02:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2024-02-11 10:21:38 -06:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . attn_q_norm ) {
Qcur = ggml_view_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ,
ggml_element_size ( Qcur ) * n_embd_head ,
ggml_element_size ( Qcur ) * n_embd_head * n_head ,
0 ) ;
cb ( Qcur , " Qcur " , il ) ;
Kcur = ggml_view_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ,
ggml_element_size ( Kcur ) * n_embd_head ,
ggml_element_size ( Kcur ) * n_embd_head * n_head_kv ,
0 ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-02-13 12:03:53 -05:00
2025-01-03 10:18:53 +02:00
Qcur = llm_build_norm ( ctx0 , Qcur , hparams ,
model . layers [ il ] . attn_q_norm ,
NULL ,
LLM_NORM , cb , il ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-02-13 12:03:53 -05:00
2025-01-03 10:18:53 +02:00
Kcur = llm_build_norm ( ctx0 , Kcur , hparams ,
model . layers [ il ] . attn_k_norm ,
NULL ,
LLM_NORM , cb , il ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-02-13 12:03:53 -05:00
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-02-13 12:03:53 -05:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-02-13 12:03:53 -05:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-03-04 22:31:20 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-02-11 10:21:38 -06:00
}
2025-01-03 10:18:53 +02:00
if ( il = = n_layer - 1 ) {
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
ffn_inp = ggml_get_rows ( ctx0 , ffn_inp , inp_out_ids ) ;
2024-06-06 09:22:41 +02:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * attn_out = cur ;
2024-02-11 10:21:38 -06:00
// feed-forward network
2025-01-03 10:18:53 +02:00
{
cur = llm_build_ffn ( ctx0 , lctx , ffn_inp ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
2024-02-13 12:03:53 -05:00
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_out " , il ) ;
2024-02-13 12:03:53 -05:00
}
2024-02-11 10:21:38 -06:00
2025-01-03 10:18:53 +02:00
// add together residual + FFN + self-attention
cur = ggml_add ( ctx0 , cur , inpL ) ;
cur = ggml_add ( ctx0 , cur , attn_out ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-02-11 10:21:38 -06:00
// input for next layer
inpL = cur ;
}
cur = inpL ;
2024-09-28 17:42:03 +03:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
if ( f_logit_scale ) {
cur = ggml_scale ( ctx0 , cur , f_logit_scale ) ;
}
cb ( cur , " result_output " , - 1 ) ;
2024-02-11 10:21:38 -06:00
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
2025-01-03 10:18:53 +02:00
}
2023-11-01 08:04:02 +02:00
2025-01-04 09:33:31 -05:00
struct ggml_cgraph * build_cohere2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2025-01-04 09:33:31 -05:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
const float f_logit_scale = hparams . f_logit_scale ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
// cohere2 requires different mask for layers using sliding window (SWA)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa ( ) ;
// sliding window switch pattern
const int32_t sliding_window_pattern = 4 ;
for ( int il = 0 ; il < n_layer ; + + il ) {
// three layers sliding window attention (window size 4096) and ROPE
// fourth layer uses global attention without positional embeddings
const bool is_sliding = il % sliding_window_pattern < ( sliding_window_pattern - 1 ) ;
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask ;
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams , model . layers [ il ] . attn_norm , NULL , LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
struct ggml_tensor * ffn_inp = cur ;
// self-attention
{
// rope freq factors for 128k context
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
if ( is_sliding ) {
Qcur = ggml_rope_ext ( ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , rope_factors ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale , ext_factor , attn_factor ,
beta_fast , beta_slow ) ;
cb ( Qcur , " Qcur " , il ) ;
Kcur = ggml_rope_ext ( ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos ,
rope_factors , n_rot , rope_type , n_ctx_orig , freq_base , freq_scale , ext_factor ,
attn_factor , beta_fast , beta_slow ) ;
cb ( Kcur , " Kcur " , il ) ;
} else {
// For non-sliding layers, just reshape without applying RoPE
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
cb ( Qcur , " Qcur " , il ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
cb ( Kcur , " Kcur " , il ) ;
}
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf , model . layers [ il ] . wo , model . layers [ il ] . bo , Kcur , Vcur , Qcur ,
KQ_mask_l , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
ffn_inp = ggml_get_rows ( ctx0 , ffn_inp , inp_out_ids ) ;
}
struct ggml_tensor * attn_out = cur ;
// feed-forward network
{
cur = llm_build_ffn ( ctx0 , lctx , ffn_inp , model . layers [ il ] . ffn_up , NULL , NULL , model . layers [ il ] . ffn_gate ,
NULL , NULL , model . layers [ il ] . ffn_down , NULL , NULL , NULL , LLM_FFN_SILU , LLM_FFN_PAR ,
cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
// add together residual + FFN + self-attention
cur = ggml_add ( ctx0 , cur , inpL ) ;
cur = ggml_add ( ctx0 , cur , attn_out ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams , model . output_norm , NULL , LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
if ( f_logit_scale ) {
cur = ggml_scale ( ctx0 , cur , f_logit_scale ) ;
}
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
// ref: https://allenai.org/olmo
// based on the original build_llama() function, changes:
// * non-parametric layer norm
// * clamp qkv
// * removed bias
// * removed MoE
struct ggml_cgraph * build_olmo ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
const int64_t n_embd_head = hparams . n_embd_head_v ;
2024-01-02 03:51:28 -08:00
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-01-02 03:51:28 -08:00
2023-11-01 20:11:02 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-11-01 08:04:02 +02:00
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-11-01 20:11:02 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
// norm
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
NULL , NULL ,
2023-11-01 20:11:02 +02:00
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
// self-attention
{
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( hparams . f_clamp_kqv > 0.0f ) {
Qcur = ggml_clamp ( ctx0 , Qcur , - hparams . f_clamp_kqv , hparams . f_clamp_kqv ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( hparams . f_clamp_kqv > 0.0f ) {
Kcur = ggml_clamp ( ctx0 , Kcur , - hparams . f_clamp_kqv , hparams . f_clamp_kqv ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( hparams . f_clamp_kqv > 0.0f ) {
Vcur = ggml_clamp ( ctx0 , Vcur , - hparams . f_clamp_kqv , hparams . f_clamp_kqv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2023-11-01 20:11:02 +02:00
cb ( Qcur , " Qcur " , il ) ;
2023-10-28 12:06:08 +03:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2023-09-16 03:02:13 +08:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . wo , nullptr ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2023-11-01 20:11:02 +02:00
}
2023-09-16 03:02:13 +08:00
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
2023-11-01 20:11:02 +02:00
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// feed-forward network
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
NULL , NULL ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2023-11-01 20:11:02 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2023-11-01 08:04:02 +02:00
2024-06-25 21:47:40 +01:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_out " , il ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
2023-09-16 03:02:13 +08:00
}
2025-01-03 10:18:53 +02:00
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
NULL , NULL ,
2023-11-01 20:11:02 +02:00
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2023-10-28 12:06:08 +03:00
2025-01-03 10:18:53 +02:00
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_output " , - 1 ) ;
2023-10-28 12:06:08 +03:00
2023-11-01 20:11:02 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-11-01 08:04:02 +02:00
2023-11-01 20:11:02 +02:00
return gf ;
2023-09-16 03:02:13 +08:00
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_olmo2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-10-07 00:12:43 -07:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-01-02 03:51:28 -08:00
2023-11-01 20:11:02 +02:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2023-10-07 00:12:43 -07:00
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-10-07 00:12:43 -07:00
2025-01-03 10:18:53 +02:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-11-01 20:11:02 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-10-07 00:12:43 -07:00
2023-11-01 20:11:02 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
2023-10-07 00:12:43 -07:00
2025-01-03 10:18:53 +02:00
cur = inpL ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// self_attention
2023-11-01 20:11:02 +02:00
{
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( Vcur , " Vcur " , il ) ;
2023-10-07 00:12:43 -07:00
2025-01-03 10:18:53 +02:00
Qcur = llm_build_norm ( ctx0 , Qcur , hparams , model . layers [ il ] . attn_q_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( Qcur , " Qcur_normed " , il ) ;
2023-10-07 00:12:43 -07:00
2025-01-03 10:18:53 +02:00
Kcur = llm_build_norm ( ctx0 , Kcur , hparams , model . layers [ il ] . attn_k_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( Kcur , " Kcur_normed " , il ) ;
2024-04-04 02:05:10 +08:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2024-04-04 02:05:10 +08:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , Qcur , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur_rope " , il ) ;
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision fpr parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , Kcur , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur_rope " , il ) ;
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
model . layers [ il ] . wo , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2023-11-01 20:11:02 +02:00
}
2023-10-07 00:12:43 -07:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . attn_post_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_post_norm " , il ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
2023-11-01 20:11:02 +02:00
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-10-07 00:12:43 -07:00
2025-01-03 10:18:53 +02:00
// feed-forward network
cur = llm_build_ffn ( ctx0 , lctx , ffn_inp ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . ffn_post_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " ffn_post_norm " , - 1 ) ;
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( cur , " ffn_out " , il ) ;
2023-10-07 00:12:43 -07:00
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " l_out " , il ) ;
2023-10-07 00:12:43 -07:00
2023-11-01 20:11:02 +02:00
// input for next layer
inpL = cur ;
2023-11-01 08:04:02 +02:00
}
2023-10-07 00:12:43 -07:00
2023-11-01 20:11:02 +02:00
cur = inpL ;
2023-10-10 22:48:21 +08:00
2023-11-01 20:11:02 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_norm " , - 1 ) ;
2023-11-01 08:04:02 +02:00
2025-01-03 10:18:53 +02:00
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-01 20:11:02 +02:00
cb ( cur , " result_output " , - 1 ) ;
2023-10-10 22:48:21 +08:00
2023-11-01 20:11:02 +02:00
ggml_build_forward_expand ( gf , cur ) ;
2023-10-10 22:48:21 +08:00
2023-11-01 20:11:02 +02:00
return gf ;
2023-11-01 08:04:02 +02:00
}
2023-11-14 11:17:12 +01:00
2025-01-03 10:18:53 +02:00
// based on the build_qwen2moe() function, changes:
// * removed shared experts
// * removed bias
// * added q, k norm
struct ggml_cgraph * build_olmoe ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2023-11-14 11:17:12 +01:00
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-01-02 03:51:28 -08:00
2023-11-14 11:17:12 +01:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-11-14 11:17:12 +01:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-11-14 11:17:12 +01:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-11-14 11:17:12 +01:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
2023-11-14 11:17:12 +01:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
2023-11-14 11:17:12 +01:00
cb ( cur , " attn_norm " , il ) ;
2025-01-03 10:18:53 +02:00
// self_attention
2023-11-14 11:17:12 +01:00
{
// compute Q and K and RoPE them
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2023-11-21 16:22:30 +01:00
cb ( Qcur , " Qcur " , il ) ;
2023-11-14 11:17:12 +01:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2023-11-21 16:22:30 +01:00
cb ( Kcur , " Kcur " , il ) ;
2023-11-14 11:17:12 +01:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2023-11-14 11:17:12 +01:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
Qcur = llm_build_norm ( ctx0 , Qcur , hparams , model . layers [ il ] . attn_q_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( Qcur , " Qcur_normed " , il ) ;
2024-04-16 08:48:35 -07:00
2025-01-03 10:18:53 +02:00
Kcur = llm_build_norm ( ctx0 , Kcur , hparams , model . layers [ il ] . attn_k_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( Kcur , " Kcur_normed " , il ) ;
2024-04-16 08:48:35 -07:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2024-04-16 08:48:35 -07:00
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
ctx0 , Qcur , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2023-11-21 16:22:30 +01:00
ext_factor , attn_factor , beta_fast , beta_slow
2023-11-14 11:17:12 +01:00
) ;
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur_rope " , il ) ;
2023-11-14 11:17:12 +01:00
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
ctx0 , Kcur , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2023-11-21 16:22:30 +01:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2025-01-03 10:18:53 +02:00
cb ( Kcur , " Kcur_rope " , il ) ;
2023-11-14 11:17:12 +01:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2023-11-14 11:17:12 +01:00
model . layers [ il ] . wo , NULL ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2023-11-14 11:17:12 +01:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
2023-11-14 11:17:12 +01:00
cb ( ffn_inp , " ffn_inp " , il ) ;
2025-01-03 10:18:53 +02:00
// MoE branch
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
cur = llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
nullptr ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
LLM_FFN_SILU , false ,
false , 0.0 ,
2025-01-04 21:06:11 +01:00
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( cur , " ffn_moe_out " , il ) ;
2023-11-14 11:17:12 +01:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-11-14 11:17:12 +01:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2023-11-14 11:17:12 +01:00
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-11-14 11:17:12 +01:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_openelm ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-12-02 02:16:31 +08:00
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2023-12-02 02:16:31 +08:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-12-02 02:16:31 +08:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-12-02 02:16:31 +08:00
2023-12-07 13:03:17 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-12-02 02:16:31 +08:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
const int64_t n_head = hparams . n_head ( il ) ;
const int64_t n_head_kv = hparams . n_head_kv ( il ) ;
const int64_t n_head_qkv = 2 * n_head_kv + n_head ;
cur = inpL ;
struct ggml_tensor * residual = cur ;
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
// norm
2023-12-02 02:16:31 +08:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
2023-12-02 02:16:31 +08:00
cb ( cur , " wqkv " , il ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_reshape_3d ( ctx0 , cur , n_embd_head_k , n_head_qkv , n_tokens ) ;
2023-12-02 02:16:31 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_3d ( ctx0 , cur , n_embd_head , n_head , n_tokens , cur - > nb [ 1 ] , cur - > nb [ 2 ] , 0 ) ) ;
2023-12-02 02:16:31 +08:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_3d ( ctx0 , cur , n_embd_head , n_head_kv , n_tokens , cur - > nb [ 1 ] , cur - > nb [ 2 ] , cur - > nb [ 1 ] * n_head ) ) ;
2023-12-02 02:16:31 +08:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_3d ( ctx0 , cur , n_embd_head , n_head_kv , n_tokens , cur - > nb [ 1 ] , cur - > nb [ 2 ] , cur - > nb [ 1 ] * ( n_head + n_head_kv ) ) ) ;
2023-12-02 02:16:31 +08:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
Qcur = llm_build_norm ( ctx0 , Qcur , hparams ,
model . layers [ il ] . attn_q_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( Qcur , " Qcur " , il ) ;
Kcur = llm_build_norm ( ctx0 , Kcur , hparams ,
model . layers [ il ] . attn_k_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( Kcur , " Kcur " , il ) ;
2023-12-02 02:16:31 +08:00
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , Qcur , inp_pos , NULL , n_rot , rope_type , n_ctx_orig ,
2023-12-02 02:16:31 +08:00
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , Kcur , inp_pos , NULL , n_rot , rope_type , n_ctx_orig ,
2023-12-02 02:16:31 +08:00
freq_base , freq_scale , ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
Vcur = ggml_reshape_2d ( ctx0 , Vcur , n_embd_head * n_head_kv , n_tokens ) ;
cb ( Qcur , " Vcur " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2023-12-02 02:16:31 +08:00
model . layers [ il ] . wo , NULL ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2023-12-02 02:16:31 +08:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
residual = ggml_get_rows ( ctx0 , residual , inp_out_ids ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , residual , cur ) ;
2023-12-02 02:16:31 +08:00
cb ( ffn_inp , " ffn_inp " , il ) ;
2025-01-03 10:18:53 +02:00
// feed-forward network
2023-12-02 02:16:31 +08:00
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
2023-12-27 22:39:45 +07:00
NULL ,
2023-12-02 02:16:31 +08:00
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
}
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-12-02 02:16:31 +08:00
cb ( cur , " l_out " , il ) ;
inpL = cur ;
}
cur = inpL ;
2025-01-03 10:18:53 +02:00
// norm
2023-12-02 02:16:31 +08:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-12-02 02:16:31 +08:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
2023-12-18 17:27:47 +00:00
return gf ;
}
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_gptneox ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-01-19 19:53:13 +08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
2025-01-03 10:18:53 +02:00
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
2024-01-19 19:53:13 +08:00
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-01-19 19:53:13 +08:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-01-19 19:53:13 +08:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-01-19 19:53:13 +08:00
for ( int il = 0 ; il < n_layer ; + + il ) {
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
2024-01-19 19:53:13 +08:00
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
2024-01-19 19:53:13 +08:00
cb ( Vcur , " Vcur " , il ) ;
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-01-19 19:53:13 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-01-19 19:53:13 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2024-01-19 19:53:13 +08:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-01-19 19:53:13 +08:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
// ffn
if ( hparams . use_par_res ) {
// attention and ffn are computed in parallel
// x = x + attn(ln1(x)) + ffn(ln2(x))
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * attn_out = cur ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , inpL ) ;
cb ( cur , " ffn_out " , il ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , attn_out ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
2024-01-19 19:53:13 +08:00
2025-01-03 10:18:53 +02:00
// input for next layer
inpL = cur ;
} else {
// attention and ffn are computed sequentially
// x = x + attn(ln1(x))
// x = x + ffn(ln2(x))
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
}
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
2024-01-19 19:53:13 +08:00
cb ( cur , " result_norm " , - 1 ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-01-19 19:53:13 +08:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_arctic ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update special image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixes
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix trailing whitespace
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remove old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
// inp_pos - contains the positions
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . wo , NULL ,
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
}
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
// feed-forward network
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_out = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( ffn_out , " ffn_out " , il ) ;
// MoE
cur = llm_build_norm ( ctx0 , inpSA , hparams ,
model . layers [ il ] . ffn_norm_exps , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm_exps " , il ) ;
cur = llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
nullptr ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
LLM_FFN_SILU , true ,
false , 0.0 ,
2025-01-04 21:06:11 +01:00
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( cur , " ffn_moe_out " , il ) ;
cur = ggml_add ( ctx0 , cur , ffn_out ) ;
cb ( cur , " ffn_out " , il ) ;
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_deepseek ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-04-16 23:40:48 +08:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-04-16 23:40:48 +08:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2025-01-03 10:18:53 +02:00
const float kq_scale = hparams . f_attention_scale = = 0.0f ? 1.0f / sqrtf ( float ( n_embd_head ) ) : hparams . f_attention_scale ;
2024-04-16 23:40:48 +08:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2025-01-03 10:18:53 +02:00
// self-attention
2024-04-16 23:40:48 +08:00
{
2025-01-03 10:18:53 +02:00
// rope freq factors for llama3; may return nullptr for llama2 and other models
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
2024-04-16 23:40:48 +08:00
// compute Q and K and RoPE them
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2024-04-16 23:40:48 +08:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2024-04-16 23:40:48 +08:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2024-04-16 23:40:48 +08:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-04-16 23:40:48 +08:00
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2024-04-16 23:40:48 +08:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2024-04-16 23:40:48 +08:00
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , rope_factors ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-04-16 23:40:48 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , rope_factors ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-04-16 23:40:48 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2024-04-16 23:40:48 +08:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2025-01-03 10:18:53 +02:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , kq_scale , cb , il ) ;
2024-04-16 23:40:48 +08:00
}
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2025-01-03 10:18:53 +02:00
2024-04-16 23:40:48 +08:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2025-01-03 10:18:53 +02:00
if ( ( uint32_t ) il < hparams . n_layer_dense_lead ) {
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
2024-04-16 23:40:48 +08:00
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_out " , il ) ;
} else {
// MoE branch
ggml_tensor * moe_out =
llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
nullptr ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
LLM_FFN_SILU , false ,
false , hparams . expert_weights_scale ,
2025-01-04 21:06:11 +01:00
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( moe_out , " ffn_moe_out " , il ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
// FFN shared expert
{
ggml_tensor * ffn_shexp = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up_shexp , NULL , NULL ,
model . layers [ il ] . ffn_gate_shexp , NULL , NULL ,
model . layers [ il ] . ffn_down_shexp , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( ffn_shexp , " ffn_shexp " , il ) ;
2024-04-16 23:40:48 +08:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , moe_out , ffn_shexp ) ;
cb ( cur , " ffn_out " , il ) ;
}
2024-04-16 23:40:48 +08:00
}
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2024-04-16 23:40:48 +08:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2025-01-03 10:18:53 +02:00
2024-04-16 23:40:48 +08:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_deepseek2 ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
bool is_lite = ( hparams . n_layer = = 27 ) ;
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * ( 1.0f + hparams . rope_yarn_log_mul * logf ( 1.0f / freq_scale ) ) ;
const float kq_scale = 1.0f * mscale * mscale / sqrtf ( float ( hparams . n_embd_head_k ) ) ;
const float attn_factor_scaled = 1.0f / ( 1.0f + 0.1f * logf ( 1.0f / freq_scale ) ) ;
const uint32_t n_embd_head_qk_rope = hparams . n_rot ;
const uint32_t n_embd_head_qk_nope = hparams . n_embd_head_k - hparams . n_rot ;
const uint32_t kv_lora_rank = hparams . n_lora_kv ;
2024-01-02 03:51:28 -08:00
2023-12-18 17:27:47 +00:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2025-01-03 10:18:53 +02:00
// {n_embd, n_tokens}
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-12-18 17:27:47 +00:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2023-12-18 17:27:47 +00:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2023-12-18 17:27:47 +00:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
// self_attention
2023-12-18 17:27:47 +00:00
{
2025-01-03 10:18:53 +02:00
struct ggml_tensor * q = NULL ;
if ( ! is_lite ) {
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat ( ctx0 , model . layers [ il ] . wq_a , cur ) ;
cb ( q , " q " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
q = llm_build_norm ( ctx0 , q , hparams ,
model . layers [ il ] . attn_q_a_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( q , " q " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat ( ctx0 , model . layers [ il ] . wq_b , q ) ;
cb ( q , " q " , il ) ;
2024-01-13 13:44:37 +02:00
} else {
2025-01-03 10:18:53 +02:00
q = ggml_mul_mat ( ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( q , " q " , il ) ;
2024-01-13 13:44:37 +02:00
}
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * q_nope = ggml_view_3d ( ctx0 , q , n_embd_head_qk_nope , n_head , n_tokens ,
ggml_row_size ( q - > type , hparams . n_embd_head_k ) ,
ggml_row_size ( q - > type , hparams . n_embd_head_k * n_head ) ,
0 ) ;
cb ( q_nope , " q_nope " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// and {n_head * n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * q_pe = ggml_view_3d ( ctx0 , q , n_embd_head_qk_rope , n_head , n_tokens ,
ggml_row_size ( q - > type , hparams . n_embd_head_k ) ,
ggml_row_size ( q - > type , hparams . n_embd_head_k * n_head ) ,
ggml_row_size ( q - > type , n_embd_head_qk_nope ) ) ;
cb ( q_pe , " q_pe " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat ( ctx0 , model . layers [ il ] . wkv_a_mqa , cur ) ;
cb ( kv_pe_compresseed , " kv_pe_compresseed " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// split into {kv_lora_rank, n_tokens}
struct ggml_tensor * kv_compressed = ggml_view_2d ( ctx0 , kv_pe_compresseed , kv_lora_rank , n_tokens ,
kv_pe_compresseed - > nb [ 1 ] ,
0 ) ;
cb ( kv_compressed , " kv_compressed " , il ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// and {n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * k_pe = ggml_view_3d ( ctx0 , kv_pe_compresseed , n_embd_head_qk_rope , 1 , n_tokens ,
kv_pe_compresseed - > nb [ 1 ] ,
kv_pe_compresseed - > nb [ 1 ] ,
ggml_row_size ( kv_pe_compresseed - > type , kv_lora_rank ) ) ;
cb ( k_pe , " k_pe " , il ) ;
kv_compressed = ggml_cont ( ctx0 , kv_compressed ) ; // TODO: the CUDA backend does not support non-contiguous norm
kv_compressed = llm_build_norm ( ctx0 , kv_compressed , hparams ,
model . layers [ il ] . attn_kv_a_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( kv_compressed , " kv_compressed " , il ) ;
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
struct ggml_tensor * kv = ggml_mul_mat ( ctx0 , model . layers [ il ] . wkv_b , kv_compressed ) ;
cb ( kv , " kv " , il ) ;
// split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * k_nope = ggml_view_3d ( ctx0 , kv , n_embd_head_qk_nope , n_head , n_tokens ,
ggml_row_size ( kv - > type , n_embd_head_qk_nope + hparams . n_embd_head_v ) ,
ggml_row_size ( kv - > type , n_head * ( n_embd_head_qk_nope + hparams . n_embd_head_v ) ) ,
0 ) ;
cb ( k_nope , " k_nope " , il ) ;
// and {n_head * n_embd_head_v, n_tokens}
struct ggml_tensor * v_states = ggml_view_3d ( ctx0 , kv , hparams . n_embd_head_v , n_head , n_tokens ,
ggml_row_size ( kv - > type , ( n_embd_head_qk_nope + hparams . n_embd_head_v ) ) ,
ggml_row_size ( kv - > type , ( n_embd_head_qk_nope + hparams . n_embd_head_v ) * n_head ) ,
ggml_row_size ( kv - > type , ( n_embd_head_qk_nope ) ) ) ;
cb ( v_states , " v_states " , il ) ;
v_states = ggml_cont ( ctx0 , v_states ) ;
cb ( v_states , " v_states " , il ) ;
v_states = ggml_view_2d ( ctx0 , v_states , hparams . n_embd_head_v * n_head , n_tokens ,
ggml_row_size ( kv - > type , hparams . n_embd_head_v * n_head ) ,
0 ) ;
cb ( v_states , " v_states " , il ) ;
2025-01-15 12:51:37 +01:00
q_pe = ggml_cont ( ctx0 , q_pe ) ; // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
2025-01-03 10:18:53 +02:00
q_pe = ggml_rope_ext (
ctx0 , q_pe , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor_scaled , beta_fast , beta_slow
2023-12-18 17:27:47 +00:00
) ;
2025-01-03 10:18:53 +02:00
cb ( q_pe , " q_pe " , il ) ;
// shared RoPE key
2025-01-15 12:51:37 +01:00
k_pe = ggml_cont ( ctx0 , k_pe ) ; // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
2025-01-03 10:18:53 +02:00
k_pe = ggml_rope_ext (
ctx0 , k_pe , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor_scaled , beta_fast , beta_slow
) ;
cb ( k_pe , " k_pe " , il ) ;
struct ggml_tensor * q_states = ggml_concat ( ctx0 , q_nope , q_pe , 0 ) ;
cb ( q_states , " q_states " , il ) ;
struct ggml_tensor * k_states = ggml_concat ( ctx0 , k_nope , ggml_repeat ( ctx0 , k_pe , q_pe ) , 0 ) ;
cb ( k_states , " k_states " , il ) ;
2023-12-18 17:27:47 +00:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . wo , NULL ,
k_states , v_states , q_states , KQ_mask , n_tokens , kv_head , n_kv , kq_scale , cb , il ) ;
2023-12-18 17:27:47 +00:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choice words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
if ( ( uint32_t ) il < hparams . n_layer_dense_lead ) {
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
2023-12-27 22:39:45 +07:00
NULL ,
2025-01-03 10:18:53 +02:00
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
} else {
// MoE branch
ggml_tensor * moe_out =
llm_build_moe_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_gate_inp ,
model . layers [ il ] . ffn_up_exps ,
model . layers [ il ] . ffn_gate_exps ,
model . layers [ il ] . ffn_down_exps ,
2025-01-04 21:06:11 +01:00
model . layers [ il ] . ffn_exp_probs_b ,
2025-01-03 10:18:53 +02:00
n_expert , n_expert_used ,
2025-01-04 21:06:11 +01:00
LLM_FFN_SILU , hparams . expert_weights_norm ,
2025-01-03 10:18:53 +02:00
true , hparams . expert_weights_scale ,
2025-01-04 21:06:11 +01:00
( enum llama_expert_gating_func_type ) hparams . expert_gating_func ,
2025-01-03 10:18:53 +02:00
cb , il ) ;
cb ( moe_out , " ffn_moe_out " , il ) ;
// FFN shared expert
{
ggml_tensor * ffn_shexp = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up_shexp , NULL , NULL ,
model . layers [ il ] . ffn_gate_shexp , NULL , NULL ,
model . layers [ il ] . ffn_down_shexp , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( ffn_shexp , " ffn_shexp " , il ) ;
cur = ggml_add ( ctx0 , moe_out , ffn_shexp ) ;
cb ( cur , " ffn_out " , il ) ;
}
2023-12-18 17:27:47 +00:00
}
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2023-12-18 17:27:47 +00:00
cb ( cur , " l_out " , il ) ;
2024-06-25 21:47:40 +01:00
// input for next layer
2023-12-18 17:27:47 +00:00
inpL = cur ;
}
2025-01-03 10:18:53 +02:00
cur = inpL ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2023-12-18 17:27:47 +00:00
2025-01-03 10:18:53 +02:00
// lm_head
cur = ggml_mul_mat ( ctx0 , model . output , cur ) ;
2023-12-18 17:27:47 +00:00
cb ( cur , " result_output " , - 1 ) ;
2025-01-03 10:18:53 +02:00
2024-04-24 15:00:37 +08:00
ggml_build_forward_expand ( gf , cur ) ;
2025-01-03 10:18:53 +02:00
2024-04-24 15:00:37 +08:00
return gf ;
}
2025-01-03 10:18:53 +02:00
// Builds the compute graph for the BitNet architecture and returns it.
//
// Per layer: RMS norm -> Q/K/V projections (each optionally multiplied by a
// per-projection scale tensor and offset by a bias) -> RoPE on Q and K ->
// llm_build_kv -> attn_sub_norm -> output projection -> residual add ->
// RMS norm -> gated SiLU FFN -> ffn_sub_norm -> down projection -> residual add.
// After the last layer the hidden state is RMS-normed and multiplied by
// model.tok_embd; that product is the tensor expanded into the returned graph.
struct ggml_cgraph * build_bitnet ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-04-24 15:00:37 +08:00
// the K and V head sizes must match; n_embd_head is used for both below
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-04-24 15:00:37 +08:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcast to all heads)
2025-01-03 10:18:53 +02:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-04-24 15:00:37 +08:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
// layer input, kept for the residual connection around attention
struct ggml_tensor * inpSA = inpL ;
2024-05-22 16:10:46 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
2024-04-24 15:00:37 +08:00
LLM_NORM_RMS , cb , il ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " attn_norm " , il ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
// the scale tensor and the bias are both optional; each is applied only when the layer has it
if ( model . layers [ il ] . wq_scale ) {
Qcur = ggml_mul ( ctx0 , Qcur , model . layers [ il ] . wq_scale ) ;
2024-04-24 15:00:37 +08:00
}
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
2024-04-24 15:00:37 +08:00
}
2025-01-03 10:18:53 +02:00
// B1.K
// K projection: same optional scale and bias handling as Q
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
if ( model . layers [ il ] . wk_scale ) {
Kcur = ggml_mul ( ctx0 , Kcur , model . layers [ il ] . wk_scale ) ;
}
2024-04-24 15:00:37 +08:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// B1.V
// V projection: same optional scale and bias handling as Q; no RoPE is applied to V
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
if ( model . layers [ il ] . wv_scale ) {
Vcur = ggml_mul ( ctx0 , Vcur , model . layers [ il ] . wv_scale ) ;
}
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// Q is reshaped to (n_embd_head, n_head, n_tokens) and rotated using inp_pos
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2024-04-24 15:00:37 +08:00
cb ( Qcur , " Qcur " , il ) ;
2024-05-22 04:28:32 +08:00
// K gets the same treatment, with n_head_kv heads instead of n_head
Kcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
2024-04-24 15:00:37 +08:00
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
// NOTE(review): the two NULL arguments are presumably the output weight and bias
// slots of llm_build_kv, left empty because wo is applied by hand further down,
// after attn_sub_norm; 1.0f / sqrtf(n_embd_head) is presumably the KQ scale - confirm
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
NULL , NULL ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
// extra RMS norm on the attention result, before the output projection
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . attn_sub_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_sub_norm " , il ) ;
// output projection, again with optional scale tensor and bias
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wo , cur ) ;
if ( model . layers [ il ] . wo_scale ) {
cur = ggml_mul ( ctx0 , cur , model . layers [ il ] . wo_scale ) ;
}
if ( model . layers [ il ] . bo ) {
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bo ) ;
}
cb ( cur , " attn_o_out " , il ) ;
2024-04-24 15:00:37 +08:00
}
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
// both the attention result and the residual input are reduced to the rows listed in inp_out_ids
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
2024-04-24 15:00:37 +08:00
}
2025-01-03 10:18:53 +02:00
// residual connection around attention; also the residual input of the FFN
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// feed-forward network
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
2024-04-24 15:00:37 +08:00
cb ( cur , " ffn_norm " , il ) ;
2025-01-03 10:18:53 +02:00
// only the up and gate projections (with their scale tensors) go through llm_build_ffn;
// the third weight triple is NULL because ffn_down is applied by hand below, after ffn_sub_norm
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , model . layers [ il ] . ffn_up_scale ,
model . layers [ il ] . ffn_gate , NULL , model . layers [ il ] . ffn_gate_scale ,
NULL , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_sub_out " , il ) ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . ffn_sub_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_sub_norm " , il ) ;
// down projection with its optional scale tensor
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . ffn_down , cur ) ;
if ( model . layers [ il ] . ffn_down_scale ) {
cur = ggml_mul ( ctx0 , cur , model . layers [ il ] . ffn_down_scale ) ;
2024-04-24 15:00:37 +08:00
}
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_down " , il ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// residual connection around the FFN
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2024-04-24 15:00:37 +08:00
cb ( cur , " l_out " , il ) ;
2024-06-25 21:47:40 +01:00
// input for next layer
2024-04-24 15:00:37 +08:00
inpL = cur ;
}
2025-01-03 10:18:53 +02:00
cur = inpL ;
// final RMS norm over the output of the last layer
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2024-04-24 15:00:37 +08:00
cb ( cur , " result_norm " , - 1 ) ;
2025-01-03 10:18:53 +02:00
// lm_head
// FIXME: do not use model.tok_embd directly, duplicate as model.output
cur = llm_build_lora_mm ( lctx , ctx0 , model . tok_embd , cur ) ;
2024-04-24 15:00:37 +08:00
cb ( cur , " result_output " , - 1 ) ;
2023-12-18 17:27:47 +00:00
ggml_build_forward_expand ( gf , cur ) ;
2023-12-02 02:16:31 +08:00
return gf ;
}
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
// Builds the compute graph for the T5 encoder and returns it.
//
// Asserts that the context is in encoding mode and uses the encoder weights
// (the tensors with the _enc suffix). Attention is assembled by hand from
// ggml ops and nothing in this function references kv_self: a position bias
// built from pos_bucket_enc is added to the raw KQ product before the masked
// softmax. The last tensor expanded into the graph is the RMS-normed hidden
// state of the final layer.
struct ggml_cgraph * build_t5_enc ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-04-24 15:00:37 +08:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
// (local copy that shadows the member; it is overwritten with n_outputs in the last layer)
int32_t n_tokens = this - > n_tokens ;
2023-12-24 22:35:49 +09:00
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
2025-01-03 10:18:53 +02:00
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
2024-01-02 03:51:28 -08:00
// the K and V head sizes must match; n_embd_head is used for both below
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2023-12-24 22:35:49 +09:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
// this builder is only valid while encoding
GGML_ASSERT ( lctx . is_encoding ) ;
// NOTE(review): the false argument presumably selects the non-causal (encoder) variant - confirm
struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket ( false ) ;
2023-12-24 22:35:49 +09:00
// KQ_mask (mask for 1 head, it will be broadcast to all heads)
2025-01-03 10:18:53 +02:00
struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask ( false ) ;
2023-12-24 22:35:49 +09:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
// layer input, kept for the residual connection around attention
struct ggml_tensor * inpSA = inpL ;
2023-12-24 22:35:49 +09:00
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm_enc , NULL ,
2023-12-24 22:35:49 +09:00
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq_enc , cur ) ;
2023-12-24 22:35:49 +09:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk_enc , cur ) ;
2023-12-24 22:35:49 +09:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv_enc , cur ) ;
2023-12-24 22:35:49 +09:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
// split Q and K into heads: (n_embd_head, heads, n_tokens)
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
// swap dims 1 and 2 of Q and K; only K is made contiguous
struct ggml_tensor * q = ggml_permute ( ctx0 , Qcur , 0 , 2 , 1 , 3 ) ;
struct ggml_tensor * k = ggml_cont ( ctx0 , ggml_permute ( ctx0 , Kcur , 0 , 2 , 1 , 3 ) ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * kq = ggml_mul_mat ( ctx0 , k , q ) ;
cb ( kq , " kq " , il ) ;
2023-12-24 22:35:49 +09:00
2025-01-03 10:18:53 +02:00
// layers without their own attn_rel_b_enc fall back to the tensor of layer 0
struct ggml_tensor * attn_rel_b = model . layers [ il ] . attn_rel_b_enc ? model . layers [ il ] . attn_rel_b_enc : model . layers [ 0 ] . attn_rel_b_enc ;
struct ggml_tensor * pos_bias = llm_build_pos_bias ( pos_bucket_enc , attn_rel_b ) ;
// the position bias is added to the raw scores before the softmax
struct ggml_tensor * kq_b = ggml_add ( ctx0 , kq , pos_bias ) ;
cb ( kq_b , " kq_b " , il ) ;
// masked softmax; the scale argument is 1.0f, so the scores are not rescaled here
kq = ggml_soft_max_ext ( ctx0 , kq_b , KQ_mask_enc , 1.0f , hparams . f_max_alibi_bias ) ;
cb ( kq , " kq_soft_max_ext " , il ) ;
// V is viewed as (n_embd_gqa, n_tokens), transposed and made contiguous,
// then viewed per head as (n_tokens, n_embd_head, n_head_kv) for the product with kq
struct ggml_tensor * v = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , Vcur , n_embd_gqa , n_tokens ) ) ) ;
cb ( v , " v " , il ) ;
struct ggml_tensor * kqv = ggml_mul_mat ( ctx0 , ggml_reshape_3d ( ctx0 , v , n_tokens , n_embd_head , n_head_kv ) , kq ) ;
cb ( kqv , " kqv " , il ) ;
// swap dims 1 and 2 back, then flatten the heads into (n_embd_gqa, n_tokens)
struct ggml_tensor * kqv_merged = ggml_permute ( ctx0 , kqv , 0 , 2 , 1 , 3 ) ;
cb ( kqv_merged , " kqv_merged " , il ) ;
cur = ggml_cont_2d ( ctx0 , kqv_merged , n_embd_gqa , n_tokens ) ;
cb ( cur , " kqv_merged_cont " , il ) ;
// the attention result is expanded into the graph before the output projection is built
ggml_build_forward_expand ( gf , cur ) ;
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wo_enc , cur ) ;
cb ( cur , " kqv_out " , il ) ;
}
2023-12-24 22:35:49 +09:00
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
// residual connection around attention; also the residual input of the FFN
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2023-12-24 22:35:49 +09:00
// feed-forward network
{
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm_enc , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
// T5 uses relu, flan-T5 uses gelu-gated
// the choice is made per layer: GELU with a parallel gate when ffn_gate_enc is present,
// RELU in sequential mode when it is not
2024-07-15 20:50:47 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_up_enc , NULL , NULL ,
model . layers [ il ] . ffn_gate_enc , NULL , NULL ,
model . layers [ il ] . ffn_down_enc , NULL , NULL ,
2023-12-27 22:39:45 +07:00
NULL ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU ,
model . layers [ il ] . ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ ,
cb , il ) ;
2023-12-24 22:35:49 +09:00
cb ( cur , " ffn_out " , il ) ;
}
2025-01-03 10:18:53 +02:00
// residual connection around the FFN
// NOTE(review): the callback below reuses the ffn_out label already given to the FFN result above
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( cur , " ffn_out " , il ) ;
// add the per-layer tensor from lctx.cvec when one is returned for this layer
ggml_tensor * layer_dir = lctx . cvec . tensor_for ( il ) ;
if ( layer_dir ! = nullptr ) {
cur = ggml_add ( ctx0 , cur , layer_dir ) ;
}
2023-12-24 22:35:49 +09:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
2025-01-03 10:18:53 +02:00
// the hidden state is reported to the callback once before the final norm and once after it
cb ( cur , " result_embd " , - 1 ) ;
2023-12-24 22:35:49 +09:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm_enc , NULL ,
2023-12-24 22:35:49 +09:00
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_t5_dec ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-01-02 03:51:28 -08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2023-12-28 09:03:57 -05:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( ! lctx . is_encoding ) ;
GGML_ASSERT ( n_outputs_enc > 0 & & " call llama_encode() first " ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * embd_enc = llm_build_inp_embd_enc ( ) ;
struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket ( true ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask ( ) ;
struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross ( ) ;
2023-12-28 09:03:57 -05:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
// norm
2023-12-28 09:03:57 -05:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
2023-12-28 09:03:57 -05:00
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
llm_build_kv_store ( ctx0 , hparams , cparams , kv_self , gf , Kcur , Vcur , n_tokens , kv_head , cb , il ) ;
struct ggml_tensor * k =
ggml_view_3d ( ctx0 , kv_self . k_l [ il ] ,
n_embd_head_k , n_kv , n_head_kv ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa ) ,
ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_head_k ) ,
0 ) ;
cb ( k , " k " , il ) ;
struct ggml_tensor * v =
ggml_view_3d ( ctx0 , kv_self . v_l [ il ] ,
n_kv , n_embd_head_v , n_head_kv ,
ggml_element_size ( kv_self . v_l [ il ] ) * n_ctx ,
ggml_element_size ( kv_self . v_l [ il ] ) * n_ctx * n_embd_head_v ,
0 ) ;
cb ( v , " v " , il ) ;
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
struct ggml_tensor * q = ggml_permute ( ctx0 , Qcur , 0 , 2 , 1 , 3 ) ;
struct ggml_tensor * kq = ggml_mul_mat ( ctx0 , k , q ) ;
cb ( kq , " kq " , il ) ;
struct ggml_tensor * attn_rel_b = model . layers [ il ] . attn_rel_b ? model . layers [ il ] . attn_rel_b : model . layers [ 0 ] . attn_rel_b ;
struct ggml_tensor * pos_bias = llm_build_pos_bias ( pos_bucket_dec , attn_rel_b ) ;
struct ggml_tensor * kq_b = ggml_add ( ctx0 , kq , pos_bias ) ;
cb ( kq_b , " kq_b " , il ) ;
kq = ggml_soft_max_ext ( ctx0 , kq_b , KQ_mask_dec , 1.0f , hparams . f_max_alibi_bias ) ;
cb ( kq , " kq_soft_max_ext " , il ) ;
struct ggml_tensor * kqv = ggml_mul_mat ( ctx0 , v , kq ) ;
cb ( kqv , " kqv " , il ) ;
struct ggml_tensor * kqv_merged = ggml_permute ( ctx0 , kqv , 0 , 2 , 1 , 3 ) ;
cb ( kqv_merged , " kqv_merged " , il ) ;
cur = ggml_cont_2d ( ctx0 , kqv_merged , n_embd_gqa , n_tokens ) ;
cb ( cur , " kqv_merged_cont " , il ) ;
ggml_build_forward_expand ( gf , cur ) ;
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wo , cur ) ;
cb ( cur , " kqv_out " , il ) ;
}
cur = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( cur , " cross_inp " , il ) ;
struct ggml_tensor * inpCA = cur ;
// norm
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . attn_norm_cross , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm_cross " , il ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
// cross-attention
{
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq_cross , cur ) ;
2023-12-28 09:03:57 -05:00
cb ( Qcur , " Qcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk_cross , embd_enc ) ;
2023-12-28 09:03:57 -05:00
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv_cross , embd_enc ) ;
2023-12-28 09:03:57 -05:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
Kcur = ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_outputs_enc ) ;
2023-12-28 09:03:57 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * q = ggml_permute ( ctx0 , Qcur , 0 , 2 , 1 , 3 ) ;
struct ggml_tensor * k = ggml_cont ( ctx0 , ggml_permute ( ctx0 , Kcur , 0 , 2 , 1 , 3 ) ) ;
struct ggml_tensor * kq = ggml_mul_mat ( ctx0 , k , q ) ;
cb ( kq , " kq " , il ) ;
kq = ggml_soft_max_ext ( ctx0 , kq , KQ_mask_cross , 1.0f , hparams . f_max_alibi_bias ) ;
cb ( kq , " kq_soft_max_ext " , il ) ;
struct ggml_tensor * v = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , Vcur , n_embd_gqa , n_outputs_enc ) ) ) ;
cb ( v , " v " , il ) ;
struct ggml_tensor * kqv = ggml_mul_mat ( ctx0 , ggml_reshape_3d ( ctx0 , v , n_outputs_enc , n_embd_head , n_head_kv ) , kq ) ;
cb ( kqv , " kqv " , il ) ;
struct ggml_tensor * kqv_merged = ggml_permute ( ctx0 , kqv , 0 , 2 , 1 , 3 ) ;
cb ( kqv_merged , " kqv_merged " , il ) ;
cur = ggml_cont_2d ( ctx0 , kqv_merged , n_embd_gqa , n_tokens ) ;
cb ( cur , " kqv_merged_cont " , il ) ;
ggml_build_forward_expand ( gf , cur ) ;
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wo_cross , cur ) ;
cb ( cur , " kqv_out " , il ) ;
2023-12-28 09:03:57 -05:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choice words
in the log-likelihood evaluation was probably an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using fewer than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
inpCA = ggml_get_rows ( ctx0 , inpCA , inp_out_ids ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choice words
in the log-likelihood evaluation was probably an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using fewer than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpCA ) ;
2023-12-28 09:03:57 -05:00
cb ( ffn_inp , " ffn_inp " , il ) ;
2025-01-03 10:18:53 +02:00
// feed-forward network
2023-12-28 09:03:57 -05:00
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
2023-12-28 09:03:57 -05:00
cb ( cur , " ffn_norm " , il ) ;
2025-01-03 10:18:53 +02:00
// T5 uses relu, flan-T5 uses gelu-gated
2024-07-15 20:50:47 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
2023-12-28 09:03:57 -05:00
NULL ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU ,
model . layers [ il ] . ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ ,
cb , il ) ;
2023-12-28 09:03:57 -05:00
cb ( cur , " ffn_out " , il ) ;
}
2024-06-25 21:47:40 +01:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_out " , il ) ;
ggml_tensor * layer_dir = lctx . cvec . tensor_for ( il ) ;
if ( layer_dir ! = nullptr ) {
cur = ggml_add ( ctx0 , cur , layer_dir ) ;
}
2024-06-25 21:47:40 +01:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
2023-12-28 09:03:57 -05:00
}
2025-01-03 10:18:53 +02:00
cur = inpL ;
cb ( cur , " result_embd " , - 1 ) ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2023-12-28 09:03:57 -05:00
cb ( cur , " result_norm " , - 1 ) ;
2025-01-03 10:18:53 +02:00
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2023-12-28 09:03:57 -05:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2024-01-19 17:07:27 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_jais ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-01-19 17:07:27 +08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-01-19 17:07:27 +08:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-01-19 17:07:27 +08:00
for ( int il = 0 ; il < n_layer ; + + il ) {
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
2024-01-19 17:07:27 +08:00
cb ( cur , " wqkv " , il ) ;
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * cur - > nb [ 0 ] * ( n_embd ) ) ) ;
struct ggml_tensor * Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * cur - > nb [ 0 ] * ( n_embd ) ) ) ;
struct ggml_tensor * Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * cur - > nb [ 0 ] * ( n_embd + n_embd_gqa ) ) ) ;
2024-01-19 17:07:27 +08:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
2025-01-03 10:18:53 +02:00
cb ( Vcur , " Vcur " , il ) ;
Qcur = ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) ;
2024-01-19 17:07:27 +08:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2024-01-19 17:07:27 +08:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2025-01-03 10:18:53 +02:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / float ( n_embd_head ) , cb , il ) ;
2024-01-19 17:07:27 +08:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpL = ggml_get_rows ( ctx0 , inpL , inp_out_ids ) ;
}
2024-01-19 17:07:27 +08:00
// add the input
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpL ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
// FF
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_gate , model . layers [ il ] . ffn_gate_b , NULL ,
2024-06-26 14:27:46 +08:00
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
2024-01-19 17:07:27 +08:00
NULL ,
2025-01-03 10:18:53 +02:00
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
2024-01-19 17:07:27 +08:00
cb ( cur , " ffn_out " , il ) ;
}
2025-01-03 10:18:53 +02:00
inpL = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( inpL , " l_out " , il ) ;
2024-01-19 17:07:27 +08:00
}
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2025-01-03 10:18:53 +02:00
2024-01-19 17:07:27 +08:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_chatglm ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-01-31 18:47:10 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
2025-01-03 10:18:53 +02:00
const int64_t n_embd_gqa = hparams . n_embd_v_gqa ( ) ;
2024-01-31 18:47:10 +02:00
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-01-31 18:47:10 +02:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-01-31 18:47:10 +02:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-01-31 18:47:10 +02:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm ,
NULL ,
LLM_NORM_RMS , cb , il ) ;
2024-01-31 18:47:10 +02:00
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Qcur = nullptr ;
struct ggml_tensor * Kcur = nullptr ;
struct ggml_tensor * Vcur = nullptr ;
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wqkv , cur ) ;
cb ( cur , " wqkv " , il ) ;
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . layers [ il ] . bqkv ) ;
cb ( cur , " bqkv " , il ) ;
Qcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd , n_tokens , cur - > nb [ 1 ] , 0 * sizeof ( float ) * ( n_embd ) ) ) ;
Kcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd ) ) ) ;
Vcur = ggml_cont ( ctx0 , ggml_view_2d ( ctx0 , cur , n_embd_gqa , n_tokens , cur - > nb [ 1 ] , 1 * sizeof ( float ) * ( n_embd + n_embd_gqa ) ) ) ;
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
cb ( Kcur , " Kcur " , il ) ;
cb ( Vcur , " Vcur " , il ) ;
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-01-31 18:47:10 +02:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur_rope " , il ) ;
2024-01-31 18:47:10 +02:00
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-01-31 18:47:10 +02:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2025-01-03 10:18:53 +02:00
cb ( Kcur , " Kcur_rope " , il ) ;
2024-01-31 18:47:10 +02:00
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2024-01-31 18:47:10 +02:00
model . layers [ il ] . wo , NULL ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2025-01-03 10:18:53 +02:00
2024-01-31 18:47:10 +02:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2025-01-03 10:18:53 +02:00
// Add the input
2024-01-31 18:47:10 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2025-01-03 10:18:53 +02:00
// FF
{
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm ,
NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SWIGLU , LLM_FFN_SEQ , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
}
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
inpL = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( inpL , " l_out " , il ) ;
}
2024-01-31 18:47:10 +02:00
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . output_norm ,
NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
2024-01-31 18:47:10 +02:00
cb ( cur , " result_norm " , - 1 ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-01-31 18:47:10 +02:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2024-02-01 17:19:51 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_nemotron ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-02-01 17:19:51 +08:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
2025-01-03 10:18:53 +02:00
//GGML_ASSERT(n_embd_head == hparams.n_rot);
2024-02-01 17:19:51 +08:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-02-01 17:19:51 +08:00
// inp_pos - contains the positions
2024-03-13 18:54:21 +01:00
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
2024-02-01 17:19:51 +08:00
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2024-03-13 18:54:21 +01:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-02-01 17:19:51 +08:00
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . attn_norm ,
model . layers [ il ] . attn_norm_b ,
LLM_NORM , cb , il ) ;
2024-02-01 17:19:51 +08:00
cb ( cur , " attn_norm " , il ) ;
// self-attention
{
// compute Q and K and RoPE them
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2024-02-01 17:19:51 +08:00
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2024-02-01 17:19:51 +08:00
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2024-02-01 17:19:51 +08:00
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2024-05-22 04:28:32 +08:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-02-01 17:19:51 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Qcur , " Qcur " , il ) ;
2024-05-22 04:28:32 +08:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
2024-06-05 11:29:20 +03:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
2024-02-01 17:19:51 +08:00
ext_factor , attn_factor , beta_fast , beta_slow
) ;
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2024-02-01 17:19:51 +08:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
2024-05-11 10:32:41 +03:00
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-02-01 17:19:51 +08:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
2024-02-01 17:19:51 +08:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
// feed-forward network
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_norm ,
model . layers [ il ] . ffn_norm_b ,
LLM_NORM , cb , il ) ;
2024-02-01 17:19:51 +08:00
cb ( cur , " ffn_norm " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . ffn_up , model . layers [ il ] . ffn_up_b , NULL ,
NULL , NULL , NULL ,
model . layers [ il ] . ffn_down , model . layers [ il ] . ffn_down_b , NULL ,
2024-02-01 17:19:51 +08:00
NULL ,
2025-01-03 10:18:53 +02:00
LLM_FFN_RELU_SQR , LLM_FFN_SEQ , cb , il ) ;
2024-02-01 17:19:51 +08:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_out " , il ) ;
2024-06-25 21:47:40 +01:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
2024-02-01 17:19:51 +08:00
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm , model . output_norm_b ,
LLM_NORM , cb , - 1 ) ;
2024-02-01 17:19:51 +08:00
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2024-02-01 17:19:51 +08:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * build_exaone ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-09-16 14:45:20 +08:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-09-16 14:45:20 +08:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
struct ggml_tensor * inpSA = inpL ;
// norm
cur = llm_build_norm ( ctx0 , inpL , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " attn_norm " , il ) ;
2025-01-03 10:18:53 +02:00
// self-attention
2024-09-16 14:45:20 +08:00
{
2025-01-03 10:18:53 +02:00
// rope freq factors for llama3; may return nullptr for llama2 and other models
struct ggml_tensor * rope_factors = build_rope_factors ( il ) ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
cb ( Qcur , " Qcur " , il ) ;
if ( model . layers [ il ] . bq ) {
Qcur = ggml_add ( ctx0 , Qcur , model . layers [ il ] . bq ) ;
cb ( Qcur , " Qcur " , il ) ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
cb ( Kcur , " Kcur " , il ) ;
if ( model . layers [ il ] . bk ) {
Kcur = ggml_add ( ctx0 , Kcur , model . layers [ il ] . bk ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
cb ( Vcur , " Vcur " , il ) ;
if ( model . layers [ il ] . bv ) {
Vcur = ggml_add ( ctx0 , Vcur , model . layers [ il ] . bv ) ;
cb ( Vcur , " Vcur " , il ) ;
}
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
Qcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , rope_factors ,
2024-09-16 14:45:20 +08:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2025-01-03 10:18:53 +02:00
cb ( Qcur , " Qcur " , il ) ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
Kcur = ggml_rope_ext (
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , rope_factors ,
2024-09-16 14:45:20 +08:00
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2025-01-03 10:18:53 +02:00
cb ( Kcur , " Kcur " , il ) ;
2024-09-16 14:45:20 +08:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . wo , model . layers [ il ] . bo ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-09-16 14:45:20 +08:00
}
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
2024-09-16 14:45:20 +08:00
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
}
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
// feed-forward network
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-09-16 14:45:20 +08:00
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
2024-09-16 14:45:20 +08:00
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " ffn_out " , il ) ;
2024-09-16 14:45:20 +08:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
// Builds the compute graph for the RWKV6 architecture.
//
// The per-layer recurrent state is kept in the KV cache tensors: kv_self.k_l[il] holds the
// token-shift state (two n_embd-wide rows per sequence, one for the time-mix input and one
// for the channel-mix input) and kv_self.v_l[il] holds the wkv state. Each layer reads both
// states, runs llm_build_rwkv6_time_mix and llm_build_rwkv6_channel_mix with residual adds,
// and copies the updated states back at the kv_head slot.
//
// Requires a ubatch with equal-length sequences (asserted below). After the last layer the
// rows selected by build_inp_out_ids() are normalized and projected with model.output; the
// returned graph has that projection expanded into it.
ggml_cgraph * build_rwkv6() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

    // Token shift state dimensions should be 2 * n_emb
    GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);

    const int64_t n_seqs       = ubatch.n_seqs;
    const int64_t n_seq_tokens = ubatch.n_seq_tokens;
    const int64_t n_tokens     = ubatch.n_tokens;
    GGML_ASSERT(n_seqs != 0);
    GGML_ASSERT(ubatch.equal_seqs);
    GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;
    struct ggml_tensor * state_copy = build_inp_s_copy();
    struct ggml_tensor * state_mask = build_inp_s_mask();

    inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
    inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);

    for (int il = 0; il < n_layer; ++il) {
        const llama_layer * layer = &model.layers[il];

        // (ab)using the KV cache to store the states
        struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
                gf, kv_self.k_l[il], state_copy, state_mask,
                hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
        struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
                gf, kv_self.v_l[il], state_copy, state_mask,
                hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);

        cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
        token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);

        // row 0 of each sequence's shift state feeds the time-mix, row 1 the channel-mix
        struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
        struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));

        // x_prev = [stored shift row, first n_seq_tokens - 1 rows of the normalized input]
        struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
        struct ggml_tensor * x_prev = ggml_concat(
            ctx0,
            att_shift,
            ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
            1
        );

        cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size));
        ggml_build_forward_expand(gf, cur);

        // store the wkv state (passed by address to the time-mix above) back into the cache
        ggml_build_forward_expand(
            gf,
            ggml_cpy(
                ctx0,
                wkv_states,
                ggml_view_1d(
                    ctx0,
                    kv_self.v_l[il],
                    hparams.n_embd_v_s() * n_seqs,
                    hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
                )
            )
        );

        struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
        x_prev = ggml_concat(
            ctx0,
            ffn_shift,
            ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
            1
        );
        cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
        ggml_build_forward_expand(gf, cur);

        // the last row of each normalized input becomes the next token-shift state
        struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens - 1) * n_embd * ggml_element_size(x_norm_att));
        struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens - 1) * n_embd * ggml_element_size(x_norm_ffn));

        token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);

        ggml_build_forward_expand(
            gf,
            ggml_cpy(
                ctx0,
                ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
                ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
            )
        );

        // halve the activations after every rescale_every_n_layers-th layer when enabled
        if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
            cur = ggml_scale(ctx0, cur, 0.5F);
        }

        cur = lctx.cvec.apply_to(ctx0, cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    // flatten back to one row per token and keep only the rows listed in inp_out_ids
    struct ggml_tensor * inp_out_ids = build_inp_out_ids();
    cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
    cur = ggml_get_rows(ctx0, cur, inp_out_ids);

    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
    cb(cur, "result_norm", -1);

    cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
    cb(cur, "result_output", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
2024-03-02 01:00:46 +05:30
2025-01-10 09:58:08 +08:00
// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
// Builds the compute graph for the RWKV6-Qwen2 hybrid architecture.
//
// The per-layer recurrent state is kept in the KV cache tensors: kv_self.k_l[il] holds a
// single n_embd-wide token-shift row per sequence and kv_self.v_l[il] holds the wkv state.
// Each layer runs llm_build_rwkv6_time_mix on the RMS-normalized input (residual add), then
// an RMS norm and llm_build_ffn with up/gate/down weights (LLM_FFN_SILU / LLM_FFN_PAR) with
// a second residual add. Updated states are copied back at the kv_head slot.
//
// Requires a ubatch with equal-length sequences (asserted below). After the last layer the
// rows selected by build_inp_out_ids() are normalized and projected with model.output; the
// returned graph has that projection expanded into it.
ggml_cgraph * build_rwkv6qwen2() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

    GGML_ASSERT(n_embd == hparams.n_embd_k_s());

    const int64_t n_seqs       = ubatch.n_seqs;
    const int64_t n_seq_tokens = ubatch.n_seq_tokens;
    const int64_t n_tokens     = ubatch.n_tokens;
    GGML_ASSERT(n_seqs != 0);
    GGML_ASSERT(ubatch.equal_seqs);
    GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;
    struct ggml_tensor * state_copy = build_inp_s_copy();
    struct ggml_tensor * state_mask = build_inp_s_mask();

    inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);

    for (int il = 0; il < n_layer; ++il) {
        const llama_layer * layer = &model.layers[il];

        // (ab)using the KV cache to store the states
        struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
                gf, kv_self.k_l[il], state_copy, state_mask,
                hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
        struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
                gf, kv_self.v_l[il], state_copy, state_mask,
                hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);

        cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
        token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs);

        // x_prev = [stored shift row, first n_seq_tokens - 1 rows of the normalized input]
        struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il);
        struct ggml_tensor * x_prev = ggml_concat(
            ctx0,
            token_shift,
            ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
            1
        );

        // the last row of the normalized input becomes the next token-shift state
        struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens - 1) * n_embd * ggml_element_size(x_norm_att));
        ggml_build_forward_expand(
            gf,
            ggml_cpy(
                ctx0,
                ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0),
                ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
            )
        );

        struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv()));
        ggml_build_forward_expand(gf, ffn_inp);

        // store the wkv state (passed by address to the time-mix above) back into the cache
        ggml_build_forward_expand(
            gf,
            ggml_cpy(
                ctx0,
                wkv_states,
                ggml_view_1d(
                    ctx0,
                    kv_self.v_l[il],
                    hparams.n_embd_v_s() * n_seqs,
                    hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
                )
            )
        );

        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
        cur = llm_build_norm(ctx0, ffn_inp, hparams,
                layer->ffn_norm, NULL,
                LLM_NORM_RMS, cb, il);
        cb(cur, "ffn_norm", il);

        cur = llm_build_ffn(ctx0, lctx, cur,
                layer->ffn_up,   NULL, NULL,
                layer->ffn_gate, NULL, NULL,
                layer->ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
        cb(cur, "ffn_out", il);

        cur = ggml_add(ctx0, cur, ffn_inp);
        cur = lctx.cvec.apply_to(ctx0, cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    // flatten back to one row per token and keep only the rows listed in inp_out_ids
    struct ggml_tensor * inp_out_ids = build_inp_out_ids();
    cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
    cur = ggml_get_rows(ctx0, cur, inp_out_ids);

    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
    cb(cur, "result_norm", -1);

    cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
    cb(cur, "result_output", -1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
2025-01-03 10:18:53 +02:00
// ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes:
// * qk-norm
// * swin-norm
// * removed bias
// * removed MoE
struct ggml_cgraph * build_chameleon ( ) {
2025-01-12 11:32:42 +02:00
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-06-28 00:00:43 -04:00
2025-01-03 10:18:53 +02:00
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this - > n_tokens ;
const int64_t n_embd_head = hparams . n_embd_head_v ;
GGML_ASSERT ( n_embd_head = = hparams . n_embd_head_k ) ;
GGML_ASSERT ( n_embd_head = = hparams . n_rot ) ;
2024-06-28 00:00:43 -04:00
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-06-28 00:00:43 -04:00
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos ( ) ;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2025-01-03 10:18:53 +02:00
struct ggml_tensor * KQ_mask = build_inp_KQ_mask ( ) ;
2024-06-28 00:00:43 -04:00
for ( int il = 0 ; il < n_layer ; + + il ) {
2025-01-03 10:18:53 +02:00
struct ggml_tensor * inpSA = inpL ;
2024-07-01 18:48:34 +02:00
2024-06-28 00:00:43 -04:00
// norm
2025-01-03 10:18:53 +02:00
if ( hparams . swin_norm ) {
cur = inpL ;
} else {
cur = llm_build_norm ( ctx0 , inpL , hparams ,
2024-06-28 00:00:43 -04:00
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " attn_norm " , il ) ;
}
2024-06-28 00:00:43 -04:00
// self-attention
{
// compute Q and K and RoPE them
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Qcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wq , cur ) ;
2024-06-28 00:00:43 -04:00
cb ( Qcur , " Qcur " , il ) ;
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Kcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wk , cur ) ;
2024-06-28 00:00:43 -04:00
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
struct ggml_tensor * Vcur = llm_build_lora_mm ( lctx , ctx0 , model . layers [ il ] . wv , cur ) ;
2024-06-28 00:00:43 -04:00
cb ( Vcur , " Vcur " , il ) ;
2025-01-03 10:18:53 +02:00
if ( model . layers [ il ] . attn_q_norm ) {
Qcur = ggml_view_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ,
ggml_element_size ( Qcur ) * n_embd_head ,
ggml_element_size ( Qcur ) * n_embd_head * n_head ,
0 ) ;
cb ( Qcur , " Qcur " , il ) ;
Qcur = llm_build_norm ( ctx0 , Qcur , hparams ,
model . layers [ il ] . attn_q_norm ,
model . layers [ il ] . attn_q_norm_b ,
LLM_NORM , cb , il ) ;
cb ( Qcur , " Qcur " , il ) ;
}
if ( model . layers [ il ] . attn_k_norm ) {
Kcur = ggml_view_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ,
ggml_element_size ( Kcur ) * n_embd_head ,
ggml_element_size ( Kcur ) * n_embd_head * n_head_kv ,
0 ) ;
cb ( Kcur , " Kcur " , il ) ;
Kcur = llm_build_norm ( ctx0 , Kcur , hparams ,
model . layers [ il ] . attn_k_norm ,
model . layers [ il ] . attn_k_norm_b ,
LLM_NORM , cb , il ) ;
cb ( Kcur , " Kcur " , il ) ;
}
2024-06-28 00:00:43 -04:00
Qcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Qcur , n_embd_head , n_head , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2024-06-28 00:00:43 -04:00
cb ( Qcur , " Qcur " , il ) ;
Kcur = ggml_rope_ext (
2025-01-03 10:18:53 +02:00
ctx0 , ggml_reshape_3d ( ctx0 , Kcur , n_embd_head , n_head_kv , n_tokens ) , inp_pos , nullptr ,
n_rot , rope_type , n_ctx_orig , freq_base , freq_scale ,
ext_factor , attn_factor , beta_fast , beta_slow
) ;
2024-06-28 00:00:43 -04:00
cb ( Kcur , " Kcur " , il ) ;
2024-07-15 20:50:47 +02:00
cur = llm_build_kv ( ctx0 , lctx , kv_self , gf ,
2025-01-03 10:18:53 +02:00
model . layers [ il ] . wo , nullptr ,
Kcur , Vcur , Qcur , KQ_mask , n_tokens , kv_head , n_kv , 1.0f / sqrtf ( float ( n_embd_head ) ) , cb , il ) ;
2024-06-28 00:00:43 -04:00
2025-01-03 10:18:53 +02:00
if ( hparams . swin_norm ) {
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . attn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
}
}
2024-06-28 00:00:43 -04:00
if ( il = = n_layer - 1 ) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids ( ) ;
2025-01-03 10:18:53 +02:00
n_tokens = n_outputs ;
cur = ggml_get_rows ( ctx0 , cur , inp_out_ids ) ;
inpSA = ggml_get_rows ( ctx0 , inpSA , inp_out_ids ) ;
2024-06-28 00:00:43 -04:00
}
2025-01-03 10:18:53 +02:00
struct ggml_tensor * ffn_inp = ggml_add ( ctx0 , cur , inpSA ) ;
cb ( ffn_inp , " ffn_inp " , il ) ;
2024-06-28 00:00:43 -04:00
// feed-forward network
2025-01-03 10:18:53 +02:00
if ( ! hparams . swin_norm ) {
cur = llm_build_norm ( ctx0 , ffn_inp , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
2024-06-28 00:00:43 -04:00
}
2025-01-03 10:18:53 +02:00
cur = llm_build_ffn ( ctx0 , lctx , cur ,
model . layers [ il ] . ffn_up , NULL , NULL ,
model . layers [ il ] . ffn_gate , NULL , NULL ,
model . layers [ il ] . ffn_down , NULL , NULL ,
NULL ,
LLM_FFN_SILU , LLM_FFN_PAR , cb , il ) ;
cb ( cur , " ffn_out " , il ) ;
if ( hparams . swin_norm ) {
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . layers [ il ] . ffn_norm , NULL ,
LLM_NORM_RMS , cb , il ) ;
cb ( cur , " ffn_norm " , il ) ;
}
cur = ggml_add ( ctx0 , cur , ffn_inp ) ;
cb ( cur , " ffn_out " , il ) ;
2024-06-28 00:00:43 -04:00
cur = lctx . cvec . apply_to ( ctx0 , cur , il ) ;
cb ( cur , " l_out " , il ) ;
// input for next layer
inpL = cur ;
}
cur = inpL ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . output_norm , NULL ,
LLM_NORM_RMS , cb , - 1 ) ;
cb ( cur , " result_norm " , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2025-01-03 10:18:53 +02:00
cb ( cur , " result_output_with_img_logits " , - 1 ) ;
2024-06-29 20:44:08 -07:00
2025-01-03 10:18:53 +02:00
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
// Needs to be removed once image outputs are supported.
int img_token_end_idx = 8196 ;
int img_token_start_idx = 4 ;
int num_img_tokens = img_token_end_idx - img_token_start_idx ;
// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
// which ensures that text token values are always at least larger than image token values
struct ggml_tensor * img_logits = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_F32 , num_img_tokens ) ;
img_logits = ggml_clamp ( ctx0 , img_logits , - FLT_MAX , - FLT_MAX ) ;
cb ( img_logits , " img_logits " , - 1 ) ;
cur = ggml_set_1d ( ctx0 , cur , img_logits , ggml_element_size ( cur ) * img_token_start_idx ) ;
2024-06-28 00:00:43 -04:00
cb ( cur , " result_output " , - 1 ) ;
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
// Builds the compute graph for the wavtokenizer decoder and returns it.
//
// Stages, in the order the nodes are created:
//   1. token embeddings (model.tok_embd), transposed, then conv1d + bias
//   2. posnet layers (hparams.posnet.n_layer of them, dispatched by layer index)
//   3. tok_norm applied on the transposed tensor, then transposed back
//   4. convnext layers (hparams.convnext.n_layer of them), each a residual block
//   5. output_norm, output projection (model.output) + bias (model.output_b)
//
// The final tensor is reported through cb under the result_embd name and is the
// single root passed to ggml_build_forward_expand.
struct ggml_cgraph * build_wavtokenizer_dec ( ) {
2025-01-12 11:32:42 +02:00
// graph sized by model.max_nodes(); NOTE(review): the trailing false presumably
// disables gradient bookkeeping - confirm against ggml_new_graph_custom
struct ggml_cgraph * gf = ggml_new_graph_custom ( ctx0 , model . max_nodes ( ) , false ) ;
2024-03-02 01:00:46 +05:30
// cur is the running activation; inpL holds the input of the current block
// so it can be added back as a residual
struct ggml_tensor * cur ;
struct ggml_tensor * inpL ;
2024-10-22 15:31:06 +02:00
inpL = llm_build_inp_embd ( ctx0 , lctx , hparams , ubatch , model . tok_embd , cb ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// swap the first two dimensions and materialize the result before convolving
cur = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , inpL ) ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// input convolution followed by its bias
cur = ggml_conv_1d_ph ( ctx0 , model . conv1d , cur , 1 , 1 ) ;
cur = ggml_add ( ctx0 , cur , model . conv1d_b ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// posnet
// the layer kind is fixed by its index: 0, 1, 3, 4 are conv residual blocks,
// 2 is an attention block, 5 is a final group norm; any other index aborts
for ( uint32_t il = 0 ; il < hparams . posnet . n_layer ; + + il ) {
const auto & layer = model . layers [ il ] . posnet ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// remember the block input for the residual connection
inpL = cur ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
switch ( il ) {
case 0 :
case 1 :
case 3 :
case 4 :
{
// conv residual block: two rounds of
// group norm -> x * sigmoid(x) -> conv + bias, then add the block input.
// note: the layer index handed to llm_build_norm is the constant 0, not il
cur = llm_build_norm ( ctx0 , cur , hparams ,
layer . norm1 ,
layer . norm1_b ,
LLM_NORM_GROUP , cb , 0 ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
cur = ggml_mul ( ctx0 , ggml_sigmoid ( ctx0 , cur ) , cur ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
cur = ggml_conv_1d_ph ( ctx0 , layer . conv1 , cur , 1 , 1 ) ;
cur = ggml_add ( ctx0 , cur , layer . conv1_b ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
cur = llm_build_norm ( ctx0 , cur , hparams ,
layer . norm2 ,
layer . norm2_b ,
LLM_NORM_GROUP , cb , 0 ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
cur = ggml_mul ( ctx0 , ggml_sigmoid ( ctx0 , cur ) , cur ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
cur = ggml_conv_1d_ph ( ctx0 , layer . conv2 , cur , 1 , 1 ) ;
cur = ggml_add ( ctx0 , cur , layer . conv2_b ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// residual connection
cur = ggml_add ( ctx0 , cur , inpL ) ;
} break ;
case 2 :
{
// attention block: group norm, then q/k/v projections built from
// convolutions plus biases, all computed from the same normalized input
cur = llm_build_norm ( ctx0 , cur , hparams ,
layer . attn_norm ,
layer . attn_norm_b ,
LLM_NORM_GROUP , cb , 0 ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
struct ggml_tensor * q ;
struct ggml_tensor * k ;
struct ggml_tensor * v ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
q = ggml_conv_1d_ph ( ctx0 , layer . attn_q , cur , 1 , 1 ) ;
k = ggml_conv_1d_ph ( ctx0 , layer . attn_k , cur , 1 , 1 ) ;
v = ggml_conv_1d_ph ( ctx0 , layer . attn_v , cur , 1 , 1 ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
q = ggml_add ( ctx0 , q , layer . attn_q_b ) ;
k = ggml_add ( ctx0 , k , layer . attn_k_b ) ;
v = ggml_add ( ctx0 , v , layer . attn_v_b ) ;
// only q and k are transposed; v is used as produced by the convolution
q = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , q ) ) ;
k = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , k ) ) ;
// attention scores from k and q
struct ggml_tensor * kq = ggml_mul_mat ( ctx0 , k , q ) ;
// softmax with no mask tensor (nullptr), scaled by 1 / sqrt(posnet.n_embd),
// last argument 0.0f
kq = ggml_soft_max_ext ( ctx0 , kq , nullptr , 1.0f / sqrtf ( float ( hparams . posnet . n_embd ) ) , 0.0f ) ;
// weight v by the attention probabilities
cur = ggml_mul_mat ( ctx0 , kq , v ) ;
// output projection (conv + bias), then residual connection
cur = ggml_conv_1d_ph ( ctx0 , layer . attn_o , cur , 1 , 1 ) ;
cur = ggml_add ( ctx0 , cur , layer . attn_o_b ) ;
cur = ggml_add ( ctx0 , cur , inpL ) ;
} break ;
case 5 :
{
// final posnet layer: group norm only, no residual add
cur = llm_build_norm ( ctx0 , cur , hparams ,
layer . norm ,
layer . norm_b ,
LLM_NORM_GROUP , cb , 0 ) ;
} break ;
default : GGML_ABORT ( " unknown posnet layer " ) ;
} ;
}
// tok_norm is applied between a transpose and its inverse, so the tensor
// leaves this section in the same orientation it entered
cur = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , cur ) ) ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
model . tok_norm ,
model . tok_norm_b ,
LLM_NORM , cb , - 1 ) ;
cur = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , cur ) ) ;
inpL = cur ;
// convnext
// note: this loop indexes model.layers from 0 again and reads the convnext
// member, while the posnet loop above read the posnet member of the same array
for ( uint32_t il = 0 ; il < hparams . convnext . n_layer ; + + il ) {
const auto & layer = model . layers [ il ] . convnext ;
cur = inpL ;
// NOTE(review): presumably a depthwise convolution, judging by the dw naming - confirm
cur = ggml_conv_1d_dw_ph ( ctx0 , layer . dw , cur , 1 , 1 ) ;
cur = ggml_add ( ctx0 , cur , layer . dw_b ) ;
// norm and feed-forward run on the transposed tensor
cur = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , cur ) ) ;
cur = llm_build_norm ( ctx0 , cur , hparams ,
layer . norm ,
layer . norm_b ,
LLM_NORM , cb , - 1 ) ;
2024-03-02 01:00:46 +05:30
2024-07-15 20:50:47 +02:00
// feed-forward: pw1 (+bias) -> GELU -> pw2 (+bias), sequential, no gate tensors
cur = llm_build_ffn ( ctx0 , lctx , cur ,
2025-01-03 10:18:53 +02:00
layer . pw1 , layer . pw1_b , NULL ,
NULL , NULL , NULL ,
layer . pw2 , layer . pw2_b , NULL ,
NULL ,
LLM_FFN_GELU , LLM_FFN_SEQ , cb , il ) ;
2024-06-25 21:47:40 +01:00
2025-01-03 10:18:53 +02:00
// element-wise scale by the per-layer gamma tensor
cur = ggml_mul ( ctx0 , cur , layer . gamma ) ;
2024-03-02 01:00:46 +05:30
2025-01-03 10:18:53 +02:00
// transpose back and accumulate the residual into inpL for the next layer
cur = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , cur ) ) ;
inpL = ggml_add ( ctx0 , cur , inpL ) ;
2024-03-02 01:00:46 +05:30
}
cur = inpL ;
2025-01-03 10:18:53 +02:00
// the output norm and projection operate on the transposed tensor;
// it is not transposed back afterwards
cur = ggml_cont ( ctx0 , ggml_transpose ( ctx0 , cur ) ) ;
2024-03-02 01:00:46 +05:30
cur = llm_build_norm ( ctx0 , cur , hparams ,
2025-01-03 10:18:53 +02:00
model . output_norm ,
model . output_norm_b ,
2024-03-02 01:00:46 +05:30
LLM_NORM , cb , - 1 ) ;
// lm_head
2024-07-15 20:50:47 +02:00
cur = llm_build_lora_mm ( lctx , ctx0 , model . output , cur ) ;
2025-01-03 10:18:53 +02:00
cur = ggml_add ( ctx0 , cur , model . output_b ) ;
cb ( cur , " result_embd " , - 1 ) ;
2024-03-02 01:00:46 +05:30
// register cur as the graph root and hand the graph back to the caller
ggml_build_forward_expand ( gf , cur ) ;
return gf ;
}
2025-01-03 10:18:53 +02:00
} ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
static struct ggml_cgraph * llama_build_graph_defrag ( llama_context & lctx , const std : : vector < uint32_t > & ids ) {
llama_ubatch dummy = { } ;
dummy . equal_seqs = true ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
llm_build_cb cb = [ & ] ( struct ggml_tensor * , const char * , int ) { } ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
struct llm_build_context llm ( lctx , dummy , cb , false ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
llm . init ( ) ;
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
struct ggml_cgraph * result = llm . build_defrag ( ids ) ;
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
llm . free ( ) ;
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
return result ;
}
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
// Builds and returns the graph produced by llm_build_context::build_k_shift().
//
// No real batch is involved: the build context is fed a value-initialized ubatch
// (with equal_seqs forced to true), a tensor callback that does nothing, and
// worst_case = false.
static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
    // placeholder ubatch; only equal_seqs is set explicitly
    llama_ubatch empty_ubatch = {};
    empty_ubatch.equal_seqs = true;

    // tensors are neither named nor assigned to a backend for this graph
    llm_build_cb noop_cb = [&](struct ggml_tensor *, const char *, int) {};

    struct llm_build_context llm(lctx, empty_ubatch, noop_cb, false);
    llm.init();

    struct ggml_cgraph * gf = llm.build_k_shift();

    llm.free();
    return gf;
}
2024-03-15 16:41:22 -04:00
2025-01-03 10:18:53 +02:00
// Builds the compute graph for `ubatch` by dispatching on the model architecture.
//
//   lctx       - context supplying the model, cparams, scheduler and backends;
//                for LLM_ARCH_QWEN2VL, lctx.n_pos_per_token is set to 4 as a side effect
//   ubatch     - micro-batch forwarded to llm_build_context
//   worst_case - forwarded unchanged to llm_build_context
//
// Returns the graph produced by the architecture-specific builder; when
// cparams.embeddings is set, the graph is first passed through llm.append_pooling().
// Aborts (GGML_ABORT) if the architecture has no case in the switch below.
static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
    const llama_ubatch & ubatch,
                  bool   worst_case) {
    const auto & model = lctx.model;

    // does not depend on the tensor being visited, so it is evaluated once here
    // instead of on every callback invocation
    const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;

    // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
        // tensors with a non-negative layer index get a "<name>-<il>" name
        if (il >= 0) {
            ggml_format_name(cur, "%s-%d", name, il);
        } else {
            ggml_set_name(cur, name);
        }

        if (!lctx.cparams.offload_kqv) {
            if (strcmp(name, "kqv_merged_cont") == 0) {
                // all nodes between the KV store and the attention output are run on the CPU
                ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu);
            }
        }

        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
        // FIXME: fix in ggml_backend_sched
        if (ubatch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                const auto & dev_layer = model.dev_layer(il);
                for (auto & backend : lctx.backends) {
                    // pin the norm to every backend that runs on this layer's device and supports the op
                    if (ggml_backend_get_device(backend.get()) == dev_layer) {
                        if (ggml_backend_supports_op(backend.get(), cur)) {
                            ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
                        }
                    }
                }
            }
        }
    };

    struct ggml_cgraph * result = NULL;

    struct llm_build_context llm(lctx, ubatch, cb, worst_case);

    llm.init();

    switch (model.arch) {
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
            {
                result = llm.build_llama();
            } break;
        case LLM_ARCH_DECI:
            {
                result = llm.build_deci();
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                result = llm.build_baichuan();
            } break;
        case LLM_ARCH_FALCON:
            {
                result = llm.build_falcon();
            } break;
        case LLM_ARCH_GROK:
            {
                result = llm.build_grok();
            } break;
        case LLM_ARCH_STARCODER:
            {
                result = llm.build_starcoder();
            } break;
        case LLM_ARCH_REFACT:
            {
                result = llm.build_refact();
            } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_NOMIC_BERT:
            {
                result = llm.build_bert();
            } break;
        case LLM_ARCH_BLOOM:
            {
                result = llm.build_bloom();
            } break;
        case LLM_ARCH_MPT:
            {
                result = llm.build_mpt();
            } break;
        case LLM_ARCH_STABLELM:
            {
                result = llm.build_stablelm();
            } break;
        case LLM_ARCH_QWEN:
            {
                result = llm.build_qwen();
            } break;
        case LLM_ARCH_QWEN2:
            {
                result = llm.build_qwen2();
            } break;
        case LLM_ARCH_QWEN2VL:
            {
                // the only case that writes to the context before building
                lctx.n_pos_per_token = 4;
                result = llm.build_qwen2vl();
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                result = llm.build_qwen2moe();
            } break;
        case LLM_ARCH_PHI2:
            {
                result = llm.build_phi2();
            } break;
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            {
                result = llm.build_phi3();
            } break;
        case LLM_ARCH_PLAMO:
            {
                result = llm.build_plamo();
            } break;
        case LLM_ARCH_GPT2:
            {
                result = llm.build_gpt2();
            } break;
        case LLM_ARCH_CODESHELL:
            {
                result = llm.build_codeshell();
            } break;
        case LLM_ARCH_ORION:
            {
                result = llm.build_orion();
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                result = llm.build_internlm2();
            } break;
        case LLM_ARCH_MINICPM3:
            {
                result = llm.build_minicpm3();
            } break;
        case LLM_ARCH_GEMMA:
            {
                result = llm.build_gemma();
            } break;
        case LLM_ARCH_GEMMA2:
            {
                result = llm.build_gemma2();
            } break;
        case LLM_ARCH_STARCODER2:
            {
                result = llm.build_starcoder2();
            } break;
        case LLM_ARCH_MAMBA:
            {
                result = llm.build_mamba();
            } break;
        case LLM_ARCH_XVERSE:
            {
                result = llm.build_xverse();
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                result = llm.build_command_r();
            } break;
        case LLM_ARCH_COHERE2:
            {
                result = llm.build_cohere2();
            } break;
        case LLM_ARCH_DBRX:
            {
                result = llm.build_dbrx();
            } break;
        case LLM_ARCH_OLMO:
            {
                result = llm.build_olmo();
            } break;
        case LLM_ARCH_OLMO2:
            {
                result = llm.build_olmo2();
            } break;
        case LLM_ARCH_OLMOE:
            {
                result = llm.build_olmoe();
            } break;
        case LLM_ARCH_OPENELM:
            {
                result = llm.build_openelm();
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                result = llm.build_gptneox();
            } break;
        case LLM_ARCH_ARCTIC:
            {
                result = llm.build_arctic();
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                result = llm.build_deepseek();
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                result = llm.build_deepseek2();
            } break;
        case LLM_ARCH_CHATGLM:
            {
                result = llm.build_chatglm();
            } break;
        case LLM_ARCH_BITNET:
            {
                result = llm.build_bitnet();
            } break;
        case LLM_ARCH_T5:
            {
                // the same architecture builds either graph, selected by the context's is_encoding flag
                if (lctx.is_encoding) {
                    result = llm.build_t5_enc();
                } else {
                    result = llm.build_t5_dec();
                }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                result = llm.build_t5_enc();
            } break;
        case LLM_ARCH_JAIS:
            {
                result = llm.build_jais();
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                result = llm.build_nemotron();
            } break;
        case LLM_ARCH_EXAONE:
            {
                result = llm.build_exaone();
            } break;
        case LLM_ARCH_RWKV6:
            {
                result = llm.build_rwkv6();
            } break;
        case LLM_ARCH_RWKV6QWEN2:
            {
                result = llm.build_rwkv6qwen2();
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                result = llm.build_chameleon();
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                result = llm.build_wavtokenizer_dec();
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    if (lctx.cparams.embeddings) {
        result = llm.append_pooling(result);
    }

    llm.free();

    return result;
}
2025-01-03 10:18:53 +02:00
// Configures threading on the context's backends and launches the graph on the scheduler.
//
//   lctx       - context supplying backend_cpu, set_n_threads_fns and the scheduler
//   gf         - graph to compute
//   n_threads  - thread count passed to every registered set_n_threads function
//   threadpool - threadpool handed to the CPU backend, when one is present
//
// returns the result of ggml_backend_sched_graph_compute_async execution
// (a non-success status is logged before being returned)
static enum ggml_status llama_graph_compute(
          llama_context & lctx,
            ggml_cgraph * gf,
                    int   n_threads,
        ggml_threadpool * threadpool) {
    if (lctx.backend_cpu != nullptr) {
        // the setter is resolved by name through the backend registry rather than called directly
        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
        auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
        // do not call through the pointer if the lookup did not resolve
        if (set_threadpool_fn != nullptr) {
            set_threadpool_fn(lctx.backend_cpu, threadpool);
        }
    }

    // set the number of threads for all the backends
    for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
        // each entry pairs a backend (first) with its thread-count setter (second)
        set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
    }

    auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
    if (status != GGML_STATUS_SUCCESS) {
        LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
    }

    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));

    return status;
}
2024-11-02 15:18:56 +02:00
2025-01-27 12:07:12 +01:00
// Validate an input batch, count how many outputs it will produce, load it into
// lctx.sbatch and reserve the output buffer.
//
//   - lctx:      llama context; n_queued_tokens, embd_seq and sbatch are updated
//   - batch:     batch to prepare (exactly one of batch.token / batch.embd must be set)
//   - n_outputs: [out] total number of outputs for the batch; any incoming value is discarded
//
// return  0 on success
// return -1 if the batch contains a token id outside the vocabulary
// return -2 if the output buffer could not be reserved
//
static int llama_prepare_sbatch(
        llama_context     & lctx,
        const llama_batch & batch,
        uint32_t          & n_outputs) {
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    const uint32_t n_tokens_all = batch.n_tokens;
    const int64_t  n_embd       = hparams.n_embd;

    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
    if (batch.token) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
            if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) {
                // i is unsigned, so it is printed with %u
                LLAMA_LOG_ERROR("%s: invalid token[%u] = %d\n", __func__, i, batch.token[i]);
                return -1;
            }
        }
    }

    GGML_ASSERT(n_tokens_all <= cparams.n_batch);
    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");

    lctx.n_queued_tokens += n_tokens_all;
    lctx.embd_seq.clear();

    // count outputs
    // start from zero: the first branch below accumulates, so the result must not
    // depend on whatever value the caller happened to pass in
    n_outputs = 0;
    if (batch.logits && !embd_pooled) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
            n_outputs += batch.logits[i] != 0;
        }
    } else if (lctx.logits_all || embd_pooled) {
        n_outputs = n_tokens_all;
    } else {
        // keep last output only
        n_outputs = 1;
    }

    lctx.sbatch.from_batch(batch, n_embd,
            /* simple_split */ !lctx.kv_self.recurrent,
            /* logits_all   */ n_outputs == n_tokens_all);

    // reserve output buffer
    if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
        return -2;
    }

    return 0;
}
// Take the next micro-batch out of lctx.sbatch, record how many outputs it
// carries and, for causal attention, find a KV cache slot for it.
//
//   - lctx:             llama context (sbatch must still hold tokens)
//   - kv_slot_restorer: receives the slot that was found
//   - ubatch:           [out] the micro-batch that was split off
//   - n_outputs:        total number of outputs of the whole batch
//   - n_tokens_all:     total number of tokens of the whole batch
//
// return 0 on success
// return 1 if no KV cache slot could be found for the micro-batch
//
static int llama_prepare_ubatch(
        llama_context          & lctx,
        llama_kv_slot_restorer & kv_slot_restorer,
        llama_ubatch           & ubatch,
        const uint32_t           n_outputs,
        const uint32_t           n_tokens_all) {
    GGML_ASSERT(lctx.sbatch.n_tokens > 0);

    auto       & cache = lctx.kv_self;
    const auto & cpar  = lctx.cparams;
    const auto & hpar  = lctx.model.hparams;

    // pooled embedding: batch.logits is ignored and every token is an output
    const bool pooled = cpar.embeddings && cpar.pooling_type != LLAMA_POOLING_TYPE_NONE;

    if (!cache.recurrent) {
        ubatch = lctx.sbatch.split_simple(cpar.n_ubatch);
    } else if (pooled) {
        // Pooled embeddings cannot be split across ubatches (yet)
        ubatch = lctx.sbatch.split_seq(cpar.n_ubatch);
    } else {
        // recurrent model architectures are easier to implement
        // with equal-length sequences
        ubatch = lctx.sbatch.split_equal(cpar.n_ubatch);
    }

    // count the outputs in this u_batch
    int32_t n_out = 0;
    if (n_outputs == n_tokens_all) {
        n_out = ubatch.n_tokens;
    } else {
        GGML_ASSERT(ubatch.output);
        for (uint32_t t = 0; t < ubatch.n_tokens; ++t) {
            n_out += int32_t(ubatch.output[t] != 0);
        }
    }

    // needs to happen before the graph is built
    lctx.n_outputs = n_out;

    // non-causal masks do not use the KV cache
    if (!hpar.causal_attn) {
        return 0;
    }

    llama_kv_cache_update(&lctx);

    // if we have enough unused cells before the current head ->
    //   better to start searching from the beginning of the cache, hoping to fill it
    if (cache.head > cache.used + 2*ubatch.n_tokens) {
        cache.head = 0;
    }

    const auto slot = llama_kv_cache_find_slot(cache, ubatch);
    if (!slot) {
        return 1;
    }
    kv_slot_restorer.save(slot);

    if (!cache.recurrent) {
        // a heuristic, to avoid attending the full cache if it is not yet utilized
        // after enough generations, the benefit from this heuristic disappears
        // if we start defragmenting the cache, the benefit from this will be more important
        const uint32_t pad = llama_kv_cache_get_padding(cpar);
        cache.n = std::min(cache.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(cache), pad)));
    }

    return 0;
}
2024-10-03 01:49:47 +02:00
2025-01-27 12:07:12 +01:00
// decode a batch of tokens by evaluating the transformer
// in case of unsuccessful decoding (error or warning),
// the kv_cache state will be returned to its original state
// (for non-recurrent models) or cleaned (for recurrent models)
//
// - lctx: llama context
// - inp_batch: batch to evaluate
//
// return 0 on success
// return positive int on warning
// return negative int on error
//
static int llama_decode_impl (
llama_context & lctx ,
llama_batch inp_batch ) {
2024-03-13 18:54:21 +01:00
2025-01-27 12:07:12 +01:00
lctx . is_encoding = false ;
2023-07-30 15:58:01 +02:00
2025-01-27 12:07:12 +01:00
if ( inp_batch . n_tokens = = 0 ) {
LLAMA_LOG_ERROR ( " %s: n_tokens == 0 \n " , __func__ ) ;
return - 1 ;
}
// temporarily allocate memory for the input batch if needed
llama_batch_allocr batch_allocr ( inp_batch , inp_batch . pos ? - 1 : lctx . kv_self . max_pos ( ) + 1 ) ;
const llama_batch & batch = batch_allocr . batch ;
const auto & model = lctx . model ;
const auto & vocab = model . vocab ;
const auto & hparams = model . hparams ;
const auto & cparams = lctx . cparams ;
if ( lctx . t_compute_start_us = = 0 ) {
lctx . t_compute_start_us = ggml_time_us ( ) ;
}
auto & kv_self = lctx . kv_self ;
llama_kv_slot_restorer kv_slot_restorer ( kv_self ) ;
2023-09-28 19:04:36 +03:00
2025-01-27 12:07:12 +01:00
const int64_t n_embd = hparams . n_embd ;
const int64_t n_vocab = vocab . n_tokens ( ) ;
uint32_t n_outputs = 0 ;
uint32_t n_outputs_prev = 0 ;
{
const int ret = llama_prepare_sbatch ( lctx , batch , n_outputs ) ;
if ( ret ! = 0 ) {
return ret ;
}
}
while ( lctx . sbatch . n_tokens > 0 ) {
llama_ubatch ubatch ;
{
const int ret = llama_prepare_ubatch ( lctx , kv_slot_restorer , ubatch , n_outputs , batch . n_tokens ) ;
if ( ret ! = 0 ) {
return ret ;
2025-01-03 10:18:53 +02:00
}
}
2024-10-30 02:01:23 +01:00
2025-01-27 12:07:12 +01:00
const int n_threads = ubatch . n_tokens = = 1 ? cparams . n_threads : cparams . n_threads_batch ;
ggml_threadpool_t threadpool = ubatch . n_tokens = = 1 ? lctx . threadpool : lctx . threadpool_batch ;
GGML_ASSERT ( n_threads > 0 ) ;
2025-01-03 10:18:53 +02:00
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
ggml_backend_sched_reset ( lctx . sched . get ( ) ) ;
ggml_backend_sched_set_eval_callback ( lctx . sched . get ( ) , lctx . cparams . cb_eval , lctx . cparams . cb_eval_user_data ) ;
2024-10-30 02:01:23 +01:00
2025-01-03 10:18:53 +02:00
ggml_cgraph * gf = llama_build_graph ( lctx , ubatch , false ) ;
2023-06-28 18:35:54 +02:00
2025-01-03 10:18:53 +02:00
// the output is always the last tensor in the graph
struct ggml_tensor * res = ggml_graph_node ( gf , - 1 ) ;
struct ggml_tensor * embd = ggml_graph_node ( gf , - 2 ) ;
2024-01-24 12:48:14 +01:00
2025-01-03 10:18:53 +02:00
if ( lctx . n_outputs = = 0 ) {
// no output
res = nullptr ;
embd = nullptr ;
} else if ( cparams . embeddings ) {
res = nullptr ; // do not extract logits for embedding case
embd = nullptr ;
for ( int i = ggml_graph_n_nodes ( gf ) - 1 ; i > = 0 ; - - i ) {
if ( strcmp ( ggml_graph_node ( gf , i ) - > name , " result_embd_pooled " ) = = 0 ) {
embd = ggml_graph_node ( gf , i ) ;
break ;
}
2024-10-30 02:01:23 +01:00
}
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( embd ! = nullptr & & " missing embeddings tensor " ) ;
} else {
embd = nullptr ; // do not extract embeddings when not needed
GGML_ASSERT ( strcmp ( res - > name , " result_output " ) = = 0 & & " missing result_output tensor " ) ;
2023-12-21 21:07:46 +01:00
}
2023-06-28 18:35:54 +02:00
2025-01-03 10:18:53 +02:00
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
2023-04-17 17:28:55 +02:00
2025-01-03 10:18:53 +02:00
ggml_backend_sched_alloc_graph ( lctx . sched . get ( ) , gf ) ;
2023-06-28 18:35:54 +02:00
2025-01-03 10:18:53 +02:00
llama_set_inputs ( lctx , ubatch ) ;
2023-07-08 00:24:01 +08:00
2025-01-03 10:18:53 +02:00
const auto compute_status = llama_graph_compute ( lctx , gf , n_threads , threadpool ) ;
if ( compute_status ! = GGML_STATUS_SUCCESS ) {
kv_slot_restorer . restore ( kv_self ) ;
switch ( compute_status ) {
case GGML_STATUS_ABORTED :
return 2 ;
case GGML_STATUS_ALLOC_FAILED :
return - 2 ;
case GGML_STATUS_FAILED :
default :
return - 3 ;
}
}
2023-12-21 11:57:48 -08:00
2025-01-03 10:18:53 +02:00
// update the kv ring buffer
{
2025-01-27 12:07:12 +01:00
kv_self . head + = ubatch . n_tokens ;
2024-03-13 18:54:21 +01:00
2025-01-03 10:18:53 +02:00
// Ensure kv cache head points to a valid index.
if ( kv_self . head > = kv_self . size ) {
kv_self . head = 0 ;
}
}
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of offical Mamba models
I was using Q-bert/Mamba-* models before, which have a slighlty different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay to high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
// plot the computation graph in dot format (for debugging purposes)
//if (n_past%100 == 0) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
2023-04-17 17:28:55 +02:00
2025-01-03 10:18:53 +02:00
// extract logits
if ( res ) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend ( lctx . sched . get ( ) , res ) ;
GGML_ASSERT ( backend_res ! = nullptr ) ;
GGML_ASSERT ( lctx . logits ! = nullptr ) ;
2024-09-07 15:16:19 +03:00
2025-01-03 10:18:53 +02:00
float * logits_out = lctx . logits + n_outputs_prev * n_vocab ;
const int32_t n_outputs_new = lctx . n_outputs ;
2024-09-07 15:16:19 +03:00
2025-01-03 10:18:53 +02:00
if ( n_outputs_new ) {
GGML_ASSERT ( n_outputs_prev + n_outputs_new < = n_outputs ) ;
GGML_ASSERT ( ( n_outputs_prev + n_outputs_new ) * n_vocab < = ( int64_t ) lctx . logits_size ) ;
ggml_backend_tensor_get_async ( backend_res , res , logits_out , 0 , n_outputs_new * n_vocab * sizeof ( float ) ) ;
}
}
2024-09-07 15:16:19 +03:00
2025-01-03 10:18:53 +02:00
// extract embeddings
if ( embd ) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend ( lctx . sched . get ( ) , embd ) ;
GGML_ASSERT ( backend_embd ! = nullptr ) ;
2024-09-07 15:16:19 +03:00
2025-01-03 10:18:53 +02:00
switch ( cparams . pooling_type ) {
case LLAMA_POOLING_TYPE_NONE :
{
// extract token embeddings
GGML_ASSERT ( lctx . embd ! = nullptr ) ;
float * embd_out = lctx . embd + n_outputs_prev * n_embd ;
const int32_t n_outputs_new = lctx . n_outputs ;
2024-09-17 08:23:30 +02:00
2025-01-03 10:18:53 +02:00
if ( n_outputs_new ) {
GGML_ASSERT ( n_outputs_prev + n_outputs_new < = n_outputs ) ;
GGML_ASSERT ( ( n_outputs_prev + n_outputs_new ) * n_embd < = ( int64_t ) lctx . embd_size ) ;
ggml_backend_tensor_get_async ( backend_embd , embd , embd_out , 0 , n_outputs_new * n_embd * sizeof ( float ) ) ;
}
} break ;
case LLAMA_POOLING_TYPE_MEAN :
case LLAMA_POOLING_TYPE_CLS :
case LLAMA_POOLING_TYPE_LAST :
{
// extract sequence embeddings (cleared before processing each batch)
auto & embd_seq_out = lctx . embd_seq ;
2024-09-07 15:16:19 +03:00
2025-01-03 10:18:53 +02:00
for ( uint32_t s = 0 ; s < ubatch . n_seqs ; + + s ) {
const llama_seq_id seq_id = ubatch . seq_id [ s ] [ 0 ] ;
if ( embd_seq_out . find ( seq_id ) ! = embd_seq_out . end ( ) ) {
continue ;
}
embd_seq_out [ seq_id ] . resize ( n_embd ) ;
ggml_backend_tensor_get_async ( backend_embd , embd , embd_seq_out [ seq_id ] . data ( ) , ( n_embd * seq_id ) * sizeof ( float ) , n_embd * sizeof ( float ) ) ;
}
} break ;
case LLAMA_POOLING_TYPE_RANK :
{
// extract the rerank score - a single float per sequence
auto & embd_seq_out = lctx . embd_seq ;
2024-09-07 15:16:19 +03:00
2025-01-03 10:18:53 +02:00
for ( uint32_t s = 0 ; s < ubatch . n_seqs ; + + s ) {
const llama_seq_id seq_id = ubatch . seq_id [ s ] [ 0 ] ;
if ( embd_seq_out . find ( seq_id ) ! = embd_seq_out . end ( ) ) {
continue ;
}
embd_seq_out [ seq_id ] . resize ( 1 ) ;
ggml_backend_tensor_get_async ( backend_embd , embd , embd_seq_out [ seq_id ] . data ( ) , ( seq_id ) * sizeof ( float ) , sizeof ( float ) ) ;
}
} break ;
case LLAMA_POOLING_TYPE_UNSPECIFIED :
{
GGML_ABORT ( " unknown pooling type " ) ;
}
}
}
n_outputs_prev + = lctx . n_outputs ;
}
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
// set output mappings
{
bool sorted_output = true ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( lctx . sbatch . out_ids . size ( ) = = n_outputs ) ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
for ( size_t i = 0 ; i < n_outputs ; + + i ) {
size_t out_id = lctx . sbatch . out_ids [ i ] ;
lctx . output_ids [ out_id ] = i ;
if ( out_id ! = i ) {
sorted_output = false ;
}
}
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
2025-01-03 10:18:53 +02:00
if ( sorted_output ) {
lctx . sbatch . out_ids . clear ( ) ;
}
2024-02-25 22:12:24 +02:00
}
2025-01-03 10:18:53 +02:00
// set to total number of outputs in the batch, for use in llama_get_logits_ith
lctx . n_outputs = n_outputs ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
// wait for the computation to finish (automatically done when obtaining the model output)
//llama_synchronize(&lctx);
2023-10-03 10:09:28 -07:00
2025-01-03 10:18:53 +02:00
// decide if we need to defrag the kv cache
if ( cparams . causal_attn & & cparams . defrag_thold > = 0.0f ) {
const float fragmentation = kv_self . n > = 128 ? 1.0f - float ( kv_self . used ) / float ( kv_self . n ) : 0.0f ;
2023-11-17 16:17:37 +01:00
2025-01-03 10:18:53 +02:00
// queue defragmentation for next llama_kv_cache_update
if ( fragmentation > cparams . defrag_thold ) {
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
2023-11-17 16:17:37 +01:00
2025-01-03 10:18:53 +02:00
llama_kv_cache_defrag ( kv_self ) ;
2023-11-17 16:17:37 +01:00
}
}
2025-01-03 10:18:53 +02:00
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset ( lctx . sched . get ( ) ) ;
return 0 ;
2023-11-17 16:17:37 +01:00
}
2025-01-03 10:18:53 +02:00
// encode a batch of tokens by evaluating the encoder part of the transformer
//
// - lctx: llama context
// - batch: batch to evaluate
//
// return 0 on success
// return positive int on warning
// return negative int on error
//
2025-01-06 10:52:01 +02:00
static int llama_encode_impl (
2025-01-03 10:18:53 +02:00
llama_context & lctx ,
llama_batch inp_batch ) {
lctx . is_encoding = true ;
if ( inp_batch . n_tokens = = 0 ) {
LLAMA_LOG_ERROR ( " %s: n_tokens == 0 \n " , __func__ ) ;
2023-11-17 16:17:37 +01:00
return - 1 ;
}
2025-01-03 10:18:53 +02:00
// temporary allocate memory for the input batch if needed
llama_batch_allocr batch_allocr ( inp_batch , inp_batch . pos ? - 1 : lctx . kv_self . max_pos ( ) + 1 ) ;
2023-04-17 17:28:55 +02:00
2025-01-03 10:18:53 +02:00
const llama_batch & batch = batch_allocr . batch ;
const uint32_t n_tokens = batch . n_tokens ;
2023-08-25 15:16:19 +02:00
2025-01-03 10:18:53 +02:00
const auto & model = lctx . model ;
const auto & hparams = model . hparams ;
const auto & cparams = lctx . cparams ;
2023-08-25 15:16:19 +02:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( ( ! batch . token & & batch . embd ) | | ( batch . token & & ! batch . embd ) ) ; // NOLINT
2024-08-10 11:43:26 +02:00
2025-01-03 10:18:53 +02:00
if ( batch . token ) {
for ( uint32_t i = 0 ; i < n_tokens ; + + i ) {
2025-01-12 11:32:42 +02:00
if ( batch . token [ i ] < 0 | | ( uint32_t ) batch . token [ i ] > = model . vocab . n_tokens ( ) ) {
2025-01-03 10:18:53 +02:00
LLAMA_LOG_ERROR ( " %s: invalid token[%d] = %d \n " , __func__ , i , batch . token [ i ] ) ;
return - 1 ;
}
}
2024-07-04 15:46:11 +02:00
}
2025-01-03 10:18:53 +02:00
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
GGML_ASSERT ( cparams . n_ubatch > = n_tokens & & " encoder requires n_ubatch >= n_tokens " ) ;
2024-07-04 15:46:11 +02:00
2025-01-03 10:18:53 +02:00
if ( lctx . t_compute_start_us = = 0 ) {
lctx . t_compute_start_us = ggml_time_us ( ) ;
2024-08-21 17:58:11 -04:00
}
2025-01-03 10:18:53 +02:00
lctx . n_queued_tokens + = n_tokens ;
2023-04-17 17:28:55 +02:00
2025-01-03 10:18:53 +02:00
const int64_t n_embd = hparams . n_embd ;
2023-04-17 17:28:55 +02:00
2025-01-03 10:18:53 +02:00
lctx . sbatch . from_batch ( batch , n_embd , /* simple_split */ true , /* logits_all */ true ) ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
const llama_ubatch ubatch = lctx . sbatch . split_simple ( n_tokens ) ;
// reserve output buffer
if ( llama_output_reserve ( lctx , n_tokens ) < n_tokens ) {
LLAMA_LOG_ERROR ( " %s: could not reserve space for batch with %u outputs \n " , __func__ , n_tokens ) ;
return - 2 ;
2024-10-30 02:01:23 +01:00
} ;
2025-01-03 10:18:53 +02:00
for ( uint32_t i = 0 ; i < n_tokens ; + + i ) {
lctx . output_ids [ i ] = i ;
2024-03-15 13:43:02 -07:00
}
2025-01-03 10:18:53 +02:00
lctx . inp_embd_enc = NULL ;
lctx . n_outputs = n_tokens ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
int n_threads = n_tokens = = 1 ? cparams . n_threads : cparams . n_threads_batch ;
ggml_threadpool_t threadpool = n_tokens = = 1 ? lctx . threadpool : lctx . threadpool_batch ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( n_threads > 0 ) ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
ggml_backend_sched_reset ( lctx . sched . get ( ) ) ;
ggml_backend_sched_set_eval_callback ( lctx . sched . get ( ) , lctx . cparams . cb_eval , lctx . cparams . cb_eval_user_data ) ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
ggml_cgraph * gf = llama_build_graph ( lctx , ubatch , false ) ;
// the output embeddings after the final encoder normalization
struct ggml_tensor * embd = nullptr ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
// there are two cases here
if ( llama_model_has_decoder ( & lctx . model ) ) {
// first case is an encoder-decoder T5 model where embeddings are passed to decoder
embd = ggml_graph_node ( gf , - 1 ) ;
GGML_ASSERT ( strcmp ( embd - > name , " result_norm " ) = = 0 & & " missing result_output tensor " ) ;
} else {
// second case is an encoder-only T5 model
if ( cparams . embeddings ) {
// only output embeddings if required
embd = ggml_graph_node ( gf , - 1 ) ;
if ( strcmp ( embd - > name , " result_embd_pooled " ) ! = 0 ) {
embd = ggml_graph_node ( gf , - 2 ) ;
}
GGML_ASSERT ( strcmp ( embd - > name , " result_embd_pooled " ) = = 0 & & " missing embeddings tensor " ) ;
2024-03-15 13:43:02 -07:00
}
}
2025-01-03 10:18:53 +02:00
ggml_backend_sched_alloc_graph ( lctx . sched . get ( ) , gf ) ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
llama_set_inputs ( lctx , ubatch ) ;
2024-03-15 13:43:02 -07:00
2025-01-03 10:18:53 +02:00
const auto compute_status = llama_graph_compute ( lctx , gf , n_threads , threadpool ) ;
switch ( compute_status ) {
case GGML_STATUS_SUCCESS :
break ;
case GGML_STATUS_ABORTED :
return 2 ;
case GGML_STATUS_ALLOC_FAILED :
return - 2 ;
case GGML_STATUS_FAILED :
default :
return - 3 ;
2024-03-15 13:43:02 -07:00
}
2025-01-03 10:18:53 +02:00
// extract embeddings
if ( embd ) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend ( lctx . sched . get ( ) , embd ) ;
GGML_ASSERT ( backend_embd ! = nullptr ) ;
2023-11-23 19:07:56 +02:00
2025-01-03 10:18:53 +02:00
if ( llama_model_has_decoder ( & lctx . model ) ) {
lctx . embd_enc . resize ( n_tokens * n_embd ) ;
float * embd_out = lctx . embd_enc . data ( ) ;
2023-11-23 19:07:56 +02:00
2025-01-03 10:18:53 +02:00
ggml_backend_tensor_get_async ( backend_embd , embd , embd_out , 0 , n_tokens * n_embd * sizeof ( float ) ) ;
GGML_ASSERT ( ! ubatch . equal_seqs ) ; // TODO: handle equal splits
2023-11-23 19:07:56 +02:00
2025-01-03 10:18:53 +02:00
// remember the sequence ids used during the encoding - needed for cross attention later
lctx . seq_ids_enc . resize ( n_tokens ) ;
for ( uint32_t i = 0 ; i < n_tokens ; i + + ) {
for ( int s = 0 ; s < ubatch . n_seq_id [ i ] ; s + + ) {
llama_seq_id seq_id = ubatch . seq_id [ i ] [ s ] ;
lctx . seq_ids_enc [ i ] . insert ( seq_id ) ;
}
2023-11-23 19:07:56 +02:00
}
2025-01-03 10:18:53 +02:00
} else {
GGML_ASSERT ( lctx . embd ! = nullptr ) ;
2023-11-23 19:07:56 +02:00
2025-01-03 10:18:53 +02:00
switch ( cparams . pooling_type ) {
case LLAMA_POOLING_TYPE_NONE :
{
// extract token embeddings
GGML_ASSERT ( lctx . embd ! = nullptr ) ;
float * embd_out = lctx . embd ;
GGML_ASSERT ( n_tokens * n_embd < = ( int64_t ) lctx . embd_size ) ;
ggml_backend_tensor_get_async ( backend_embd , embd , embd_out , 0 , n_tokens * n_embd * sizeof ( float ) ) ;
} break ;
case LLAMA_POOLING_TYPE_MEAN :
case LLAMA_POOLING_TYPE_CLS :
case LLAMA_POOLING_TYPE_LAST :
{
// extract sequence embeddings
auto & embd_seq_out = lctx . embd_seq ;
embd_seq_out . clear ( ) ;
GGML_ASSERT ( ! ubatch . equal_seqs ) ; // TODO: handle equal splits
for ( uint32_t i = 0 ; i < n_tokens ; i + + ) {
const llama_seq_id seq_id = ubatch . seq_id [ i ] [ 0 ] ;
if ( embd_seq_out . find ( seq_id ) ! = embd_seq_out . end ( ) ) {
continue ;
}
embd_seq_out [ seq_id ] . resize ( n_embd ) ;
ggml_backend_tensor_get_async ( backend_embd , embd , embd_seq_out [ seq_id ] . data ( ) , ( n_embd * seq_id ) * sizeof ( float ) , n_embd * sizeof ( float ) ) ;
}
} break ;
case LLAMA_POOLING_TYPE_RANK :
{
// TODO: this likely should be the same logic as in llama_decoder_internal, but better to
// wait for an encoder model that requires this pooling type in order to test it
// https://github.com/ggerganov/llama.cpp/pull/9510
GGML_ABORT ( " RANK pooling not implemented yet " ) ;
}
case LLAMA_POOLING_TYPE_UNSPECIFIED :
{
GGML_ABORT ( " unknown pooling type " ) ;
}
2023-11-23 19:07:56 +02:00
}
}
}
2025-01-03 10:18:53 +02:00
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset ( lctx . sched . get ( ) ) ;
2023-11-23 19:07:56 +02:00
2025-01-03 10:18:53 +02:00
return 0 ;
2023-11-23 19:07:56 +02:00
}
2025-01-03 10:18:53 +02:00
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
2025-01-06 10:52:01 +02:00
static void llama_kv_cache_defrag_impl ( struct llama_context & lctx ) {
2025-01-03 10:18:53 +02:00
auto & kv_self = lctx . kv_self ;
2023-04-02 12:23:04 +02:00
2025-01-03 10:18:53 +02:00
const auto & hparams = lctx . model . hparams ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
const uint32_t n_layer = hparams . n_layer ;
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
const uint32_t n_kv = llama_kv_cache_cell_max ( kv_self ) ;
const uint32_t n_used = kv_self . used ;
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
assert ( n_used < = n_kv ) ;
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
//const int64_t t_start = ggml_time_us();
2024-01-08 11:14:04 +02:00
2025-01-03 10:18:53 +02:00
// number of cells moved
uint32_t n_moves = 0 ;
2023-04-26 20:08:43 +00:00
2025-01-03 10:18:53 +02:00
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
// - x2 for keys and values
2025-01-12 11:32:42 +02:00
//const uint32_t max_moves = model.max_nodes()/(6*n_layer);
2025-01-03 10:18:53 +02:00
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
2025-01-12 11:32:42 +02:00
const uint32_t max_moves = ( lctx . model . max_nodes ( ) - 2 * n_layer ) / ( 6 * n_layer ) ;
2024-01-08 11:14:04 +02:00
2025-01-03 10:18:53 +02:00
// determine which KV cells to move where
//
// cell i moves to ids[i]
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
std : : vector < uint32_t > ids ( n_kv , n_kv ) ;
2024-01-08 11:14:04 +02:00
2025-01-03 10:18:53 +02:00
for ( uint32_t i0 = 0 ; i0 < n_used ; + + i0 ) {
const auto & cell0 = kv_self . cells [ i0 ] ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
if ( ! cell0 . is_empty ( ) ) {
ids [ i0 ] = i0 ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
continue ;
}
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
// found a hole - fill it with data from the end of the cache
2024-11-19 13:29:26 +02:00
2025-01-03 10:18:53 +02:00
uint32_t nh = 1 ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// determine the size of the hole
while ( i0 + nh < n_used & & kv_self . cells [ i0 + nh ] . is_empty ( ) ) {
nh + + ;
}
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
uint32_t nf = 0 ;
uint32_t is = n_kv - 1 ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// starting from the end, find nh non-empty cells
for ( ; is > i0 ; - - is ) {
const auto & cell1 = kv_self . cells [ is ] ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
if ( cell1 . is_empty ( ) | | ids [ is ] ! = n_kv ) {
continue ;
}
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
// non-empty cell which is not yet moved
nf + + ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
if ( nf = = nh ) {
break ;
}
}
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
// this can only happen if `n_used` is not accurate, which would be a bug
GGML_ASSERT ( nf = = nh & & " KV defrag bug: nf != nh " ) ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
nf = 0 ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
uint32_t i1 = is ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
// are we moving a continuous block of memory?
bool cont = false ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
// should we stop searching for the next move?
bool stop = false ;
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
// go back and move the nf cells to the hole
for ( ; i1 < n_kv ; + + i1 ) {
auto & cell1 = kv_self . cells [ i1 ] ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
if ( cell1 . is_empty ( ) | | ids [ i1 ] ! = n_kv ) {
if ( n_moves = = max_moves ) {
stop = true ;
break ;
}
2023-08-21 23:07:43 +03:00
2025-01-03 10:18:53 +02:00
cont = false ;
continue ;
}
2024-04-25 17:59:03 +02:00
2025-01-03 10:18:53 +02:00
// this cell goes to (i0 + nf)
ids [ i1 ] = i0 + nf ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// move the cell meta data
kv_self . cells [ i0 + nf ] = cell1 ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// clear the old cell and move the head there
cell1 = llama_kv_cell ( ) ;
kv_self . head = n_used ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
if ( ! cont ) {
n_moves + + ;
cont = true ;
2024-07-28 00:42:05 -04:00
}
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
nf + + ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
if ( nf = = nh ) {
break ;
}
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
if ( stop | | n_moves = = max_moves ) {
break ;
2024-07-28 00:42:05 -04:00
}
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
i0 + = nh - 1 ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
if ( n_moves = = 0 ) {
return ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
#if 0
// CPU defrag
//
// TODO: optimizations are possible:
// - multiple threads
// - avoid copying to the host memory when already there
//
// likely not worth the effort, as we have ggml_graph based defrag
//
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
const uint32_t n_embd_k_gqa = hparams . n_embd_k_gqa ( ) ;
const uint32_t n_embd_v_gqa = hparams . n_embd_v_gqa ( ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
const uint32_t kv_size = kv_self . size ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
std : : vector < uint8_t > buf_k ;
std : : vector < uint8_t > buf_v ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
for ( uint32_t il = 0 ; il < n_layer ; + + il ) {
const size_t k_size_row = ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa ) ;
const size_t k_size = ggml_row_size ( kv_self . k_l [ il ] - > type , n_embd_k_gqa * kv_size ) ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
const size_t v_size_el = ggml_type_size ( kv_self . v_l [ il ] - > type ) ;
const size_t v_size = ggml_row_size ( kv_self . v_l [ il ] - > type , n_embd_v_gqa * kv_size ) ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
buf_k . resize ( k_size ) ;
buf_v . resize ( v_size ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choice words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
ggml_backend_tensor_get ( kv_self . k_l [ il ] , buf_k . data ( ) , 0 , buf_k . size ( ) ) ;
ggml_backend_tensor_get ( kv_self . v_l [ il ] , buf_v . data ( ) , 0 , buf_v . size ( ) ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choice words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
// batch move [i, i+nm) to [id, id+nm)
// note: cells can move only to a lower index
for ( uint32_t i = 0 ; i < n_kv ; + + i ) {
const uint32_t id = ids [ i ] ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choice words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
if ( i = = id | | id = = n_kv ) {
continue ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choice words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
uint32_t nm = 1 ;
2023-08-04 19:29:52 +08:00
2025-01-03 10:18:53 +02:00
while ( i + nm < n_kv & & ids [ i + nm ] = = id + nm ) {
nm + + ;
2024-07-28 00:42:05 -04:00
}
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
2025-01-03 10:18:53 +02:00
// move keys
{
const int64_t os = i * k_size_row ;
const int64_t od = id * k_size_row ;
2024-02-25 22:12:24 +02:00
2025-01-03 10:18:53 +02:00
memcpy ( buf_k . data ( ) + od , buf_k . data ( ) + os , nm * k_size_row ) ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// move values (note: they are transposed)
{
const int64_t os = i ;
const int64_t od = id ;
2024-07-28 00:42:05 -04:00
for ( uint32_t j = 0 ; j < n_embd_v_gqa ; + + j ) {
2025-01-03 10:18:53 +02:00
memcpy ( buf_v . data ( ) + ( od + j * kv_size ) * v_size_el , buf_v . data ( ) + ( os + j * kv_size ) * v_size_el , nm * v_size_el ) ;
2024-01-12 20:07:38 +01:00
}
2023-12-07 13:03:17 +02:00
}
2023-10-03 21:04:01 +03:00
2025-01-03 10:18:53 +02:00
i + = nm - 1 ;
2024-07-28 00:42:05 -04:00
}
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
ggml_backend_tensor_set ( kv_self . k_l [ il ] , buf_k . data ( ) , 0 , buf_k . size ( ) ) ;
ggml_backend_tensor_set ( kv_self . v_l [ il ] , buf_v . data ( ) , 0 , buf_v . size ( ) ) ;
}
# else
// ggml_graph defrag
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
ggml_backend_sched_reset ( lctx . sched . get ( ) ) ;
2023-04-02 12:23:04 +02:00
2025-01-03 10:18:53 +02:00
ggml_cgraph * gf = llama_build_graph_defrag ( lctx , ids ) ;
2024-04-25 17:59:03 +02:00
2025-01-03 10:18:53 +02:00
llama_graph_compute ( lctx , gf , lctx . cparams . n_threads , lctx . threadpool ) ;
# endif
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
//const int64_t t_end = ggml_time_us();
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
}
2025-01-06 10:52:01 +02:00
static void llama_kv_cache_update_impl ( struct llama_context & lctx ) {
2025-01-03 10:18:53 +02:00
bool need_reserve = false ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
if ( lctx . kv_self . has_shift ) {
if ( ! llama_kv_cache_can_shift ( & lctx ) ) {
GGML_ABORT ( " The current context does not support K-shift " ) ;
2024-07-28 00:42:05 -04:00
}
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
// apply K-shift if needed
if ( lctx . model . hparams . rope_type ! = LLAMA_ROPE_TYPE_NONE ) {
ggml_backend_sched_reset ( lctx . sched . get ( ) ) ;
2024-01-13 17:29:43 +01:00
2025-01-03 10:18:53 +02:00
ggml_cgraph * gf = llama_build_graph_k_shift ( lctx ) ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
ggml_backend_sched_alloc_graph ( lctx . sched . get ( ) , gf ) ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
llama_set_k_shift ( lctx ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
llama_graph_compute ( lctx , gf , lctx . cparams . n_threads , lctx . threadpool ) ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
need_reserve = true ;
2024-07-28 00:42:05 -04:00
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
{
auto & kv_self = lctx . kv_self ;
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
2025-01-03 10:18:53 +02:00
kv_self . has_shift = false ;
2024-04-15 08:56:55 -04:00
2025-01-03 10:18:53 +02:00
for ( uint32_t i = 0 ; i < kv_self . size ; + + i ) {
kv_self . cells [ i ] . delta = 0 ;
}
llama : greatly reduce output buffer memory usage (#6122)
* llama : greatly reduce logits memory usage
* llama : more compact state saving and reloading
* llama : fix lctx.n_outputs not being set before building graph
* perplexity : adapt to the logits API changes
* perplexity : fix Winogrande, use correct logits for second choice start
The first logits used to evaluate the second choice were not from
the end of the common prefix; instead, they were the logits from the end
of the first choice. This has been corrected.
The previous implementation sometimes had outliers in the scores of
choices for some tasks, and the logic to skip choices words
in the log-likelihood evaluation probably was an attempt to reduce those,
but it was complex and didn't quite seem to be the right thing.
This is simpler now, and the outlier scores aren't there anymore.
* perplexity : normalize spaces and punctuation in Winogrande sentences
* llama : fix embedding conditions
* llama : fix llama_get_embeddings_ith when the resulting id is 0
* llama : fix wrong n_outputs in llama_set_inputs
A mismatch happened when using a smaller n_ubatch than n_batch and then using
llama_batch_get_one(). The decision of what n_outputs should be now almost
fully depends on how lctx.n_outputs is set in llama_decode_internal.
The conditions are simpler this way.
* llama : when saving the state, recalculate n_outputs
This ensures the correct number of outputs for the entire previous batch
is stored in the session file, even when n_ubatch is smaller than n_batch.
* llama : fix not-skipping outputs of non-causal models
* llama : fix running a batch with n_outputs == 0
It previously worked because lctx.inp_out_ids was not initialized,
so it pointed to some garbage address which was somehow still valid when I
ran my tests.
* llama : keep same graph topology even when n_outputs == 0
* ggml : saner ggml_can_repeat with empty tensors
* ggml : future-proof ggml_is_empty by using GGML_MAX_DIMS - 1
* ggml : do not multi-thread ops returning empty tensors
* ggml : make ggml_is_empty public and work with views
* llama : use a vector for ctx->output_ids
* llama : rework reallocation logic for llama_output_reserve
Now comparing the actual size with the new total size of the output buffer
to allow more efficient enabling and disabling of the embeddings
and/or logits output in the future.
* ggml : skip empty tensors in all backends
* llama : fix llama_output_reserve nullptr deref when new_size is 0
* perplexity : make Winogrande work as it does on master
The problems with the Winogrande implementation will
need to be fixed in a separate PR to ease review.
* llama : clearer error messages for invalid logits or embeddings ids
* llama : assert all models that can have inp_out_ids
Since the graph topology is now constant, this presence check
can be done even when there are no outputs.
* llama : assert logits and embd buffers exist before writing to them
* llama : handle errors from llama_output_reserve at call sites
* perplexity : make hellaswag and multiple-choice outputs identical to master
Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.
This will probably be changed back in the future to make these benchmarks
a tiny bit faster.
* perplexity : fix division by zero when using less than 100 multiple-choice tasks
* llama : allow loading state saved with a different ctx size
When loading a session file, the context size is now only required to be
at least enough to load the KV cells contained in that session file,
instead of requiring to use exactly the same context size as when saving.
Doing this enables the use-case of extending or shrinking the context size
of a saved session.
This breaks existing session files because the meaning of kv_buf_size
is slightly changed (previously it was the size of the whole KV cache,
now it's only the size of the saved part of it). This allows for
finer-grained sanity checks when loading in an effort to keep kv_buf_size
useful even when the kv_size is changed.
* llama : minor
ggml-ci
* readme : update recent API changes, and warn about Vulkan
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-26 10:46:41 -04:00
}
}
2025-01-03 10:18:53 +02:00
// defragment the KV cache if needed
if ( lctx . kv_self . do_defrag ) {
2025-01-06 10:52:01 +02:00
llama_kv_cache_defrag_impl ( lctx ) ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
need_reserve = true ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
lctx . kv_self . do_defrag = false ;
2023-04-24 07:40:02 +03:00
}
2025-01-03 10:18:53 +02:00
// reserve a worst case graph again
if ( need_reserve ) {
// TODO: extract to a function
// build worst-case graph
uint32_t n_seqs = 1 ; // TODO: worst-case number of sequences
uint32_t n_tokens = std : : min ( lctx . cparams . n_ctx , lctx . cparams . n_ubatch ) ;
2025-01-12 11:32:42 +02:00
llama_token token = lctx . model . vocab . token_bos ( ) ; // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
2025-01-03 10:18:53 +02:00
llama_ubatch ubatch = { true , n_tokens , n_tokens / n_seqs , n_seqs , & token , nullptr , nullptr , nullptr , nullptr , nullptr } ;
ggml_cgraph * gf = llama_build_graph ( lctx , ubatch , true ) ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
// initialize scheduler with the worst-case graph
ggml_backend_sched_reset ( lctx . sched . get ( ) ) ;
if ( ! ggml_backend_sched_reserve ( lctx . sched . get ( ) , gf ) ) {
LLAMA_LOG_ERROR ( " %s: failed to allocate compute buffers \n " , __func__ ) ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
}
}
2023-04-24 07:40:02 +03:00
2025-01-12 11:32:42 +02:00
int32_t llama_set_adapter_lora (
2025-01-03 10:18:53 +02:00
struct llama_context * ctx ,
2025-01-12 11:32:42 +02:00
struct llama_adapter_lora * adapter ,
2025-01-03 10:18:53 +02:00
float scale ) {
2025-01-12 11:32:42 +02:00
ctx - > lora [ adapter ] = scale ;
2025-01-03 10:18:53 +02:00
return 0 ;
}
2025-01-12 11:32:42 +02:00
int32_t llama_rm_adapter_lora (
2025-01-03 10:18:53 +02:00
struct llama_context * ctx ,
2025-01-12 11:32:42 +02:00
struct llama_adapter_lora * adapter ) {
auto pos = ctx - > lora . find ( adapter ) ;
if ( pos ! = ctx - > lora . end ( ) ) {
ctx - > lora . erase ( pos ) ;
2025-01-03 10:18:53 +02:00
return 0 ;
2023-04-24 07:40:02 +03:00
}
2025-01-03 10:18:53 +02:00
return - 1 ;
}
2025-01-12 11:32:42 +02:00
void llama_clear_adapter_lora ( struct llama_context * ctx ) {
ctx - > lora . clear ( ) ;
2025-01-03 10:18:53 +02:00
}
2025-01-12 11:32:42 +02:00
int32_t llama_apply_adapter_cvec (
struct llama_context * ctx ,
2025-01-03 10:18:53 +02:00
const float * data ,
size_t len ,
int32_t n_embd ,
int32_t il_start ,
int32_t il_end ) {
2025-01-12 11:32:42 +02:00
return ctx - > cvec . apply ( ctx - > model , data , len , n_embd , il_start , il_end ) ;
2025-01-03 10:18:53 +02:00
}
2023-09-28 21:42:38 +02:00
2025-01-03 10:18:53 +02:00
//
// interface implementation
//
2023-05-02 22:26:13 -04:00
2025-01-03 10:18:53 +02:00
struct llama_context_params llama_context_default_params ( ) {
struct llama_context_params result = {
/*.n_ctx =*/ 512 ,
/*.n_batch =*/ 2048 ,
/*.n_ubatch =*/ 512 ,
/*.n_seq_max =*/ 1 ,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS , // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS ,
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED ,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED ,
/*.rope_freq_base =*/ 0.0f ,
/*.rope_freq_scale =*/ 0.0f ,
/*.yarn_ext_factor =*/ - 1.0f ,
/*.yarn_attn_factor =*/ 1.0f ,
/*.yarn_beta_fast =*/ 32.0f ,
/*.yarn_beta_slow =*/ 1.0f ,
/*.yarn_orig_ctx =*/ 0 ,
/*.defrag_thold =*/ - 1.0f ,
/*.cb_eval =*/ nullptr ,
/*.cb_eval_user_data =*/ nullptr ,
/*.type_k =*/ GGML_TYPE_F16 ,
/*.type_v =*/ GGML_TYPE_F16 ,
/*.logits_all =*/ false ,
/*.embeddings =*/ false ,
/*.offload_kqv =*/ true ,
/*.flash_attn =*/ false ,
/*.no_perf =*/ true ,
/*.abort_callback =*/ nullptr ,
/*.abort_callback_data =*/ nullptr ,
} ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
return result ;
}
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
struct llama_sampler_chain_params llama_sampler_chain_default_params ( ) {
struct llama_sampler_chain_params result = {
/*.no_perf =*/ true ,
} ;
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
return result ;
}
ggml : add Flash Attention (#5021)
* ggml : add ggml_flash_attn_ext API
* ggml : fix GQA support in ggml_flash_attn_ext
* ggml : online attention (CPU)
* metal : initial implementation
* metal : f16 precision
* metal : reduce branches
* metal : specialize for head size
* wip : 8 rows per simd group
* wip : 4 rows per simd group
* wip : template for rows per warp
* metal : parallelize across KV size
* metal : parallel reduce across heads
* metal : efficient flash_attn_f16 implementation
* metal : avoid redundant loads of the attention
* metal : scale and mask in matrix form
* metal : fix comment
* llama : avoid ggml_cast, use F32 query
* metal : add parallel reduce version (disabled)
* metal : move output into local memory + optimize
- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments
* metal : add tests, fix scaling, support C > 32
* metal : improve precision
* ggml : fix f16 mad
* metal : minor
* metal : support Q > 8
* tests : add ATTN tests
* metal : disable buffer allocation logs
* tests : more
* metal : faster inner loop for C == 32
* metal : fix array initialization
* tests : ifdef
* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
* ggml : fix ggml_soft_max mask requirement
* cuda : fix soft_max to use correct mask size
* cuda : add flash_attn kernel (wip)
* metal : optimize softmax for C > 32
* metal : optimize softmax
* tests : minor fix
* cuda : avoid zeroing fragments
* tests : update dims
* cuda : fix __hisinf() result check
* cuda : avoid warp_reduce for smax
* cuda : use int instead of int64_t
Noticeably improves performance (thanks to Johannes)
* cuda : make loops use the same loop values
Thanks Johannes again for the tip
* cuda : unroll some of the loops
* cuda : avoid __hisinf branches
* cuda : use half2 in softmax
* cuda : switch to 1 warp for bs > 16
* cuda : speed-up reduce part of the kernel
* cuda : unroll Q*K^T loop
* cuda : fix -INF block check
* cuda : simplify softmax
* cuda : fix matrix names
* cuda : minor
* llama : adapt to F16 KQ_pos
* llama : adapt new models to F16 KQ_mask
* ggml : fix F16 store (ARM NEON)
* llama : fix type of KQ_mask and KQ_pos
* ggml : fix CPU soft_max
* tests : add hs=256
* cuda : fix build
* metal : improve perf via smaller int registers
* cuda : adapt soft_max to F16 mask and pos
* CUDA: faster FlashAttention, kernel for bs == 1
* 16 cols for Phi-2
* no vec for hs, no hs==256 ncols==32 for Volta
* adjust kernel selection logic
* 4 warps, 256 stride for all D
* no ncols == 64
* Multiple parallel blocks for batch size 1
* fix compile warnings
* fix excessive KQ_b loads
* fix cmake build
* fix KV cache padding, NaN from INFINITY (#6438)
* llama : flash_attn cparam + fix defrag
* server: support flash_attn param
* server: bench: enable flash_attn param
* CUDA: refactor host code, dyn. par. blocks
* fix flash_attn_vec_f16 race condition
* flush softmax exp below threshold to 0
* store temp KQ in registers
* Calculate KQ as FP32 if KQV has GGML_PREC_F32
* Add __hgt2_mask implementation for CUDA 11
* fix KQ FP32 precision for parallel_blocks > 1
* llama-bench : add -fa,--flash-attn arg
* metal : add BS=1 kernel for flash attention (#6508)
* metal : add BS=1 kernel for flash attention (wip)
* metal : support more than 1 warps
* metal : opts
* metal : opt
* metal : switch to parallel reduce
* metal : reduce registers
* metal : simplify
* metal : initial FA vec kernel
* metal : use F32 attention accumulators
* batched-bench : add fattn arg
* llama : simplify llama_build_kv_store
ggml-ci
* llama : adapt build_olmo to changes
* ggml : fix arm fp16 store on windows
* metal : clean-up
* metal : clean-up kernel code
* metal : minor
* tests : remove benchmarks
ggml-ci
* ggml : fix avx512 const correctness
ggml-ci
* ggml : fix soft_max with bias on CPU
ggml-ci
* common : print --flash-attn in help
* ggml : fix num dimensions in ggml_flash_attn_ext
* llama : force disable flash attention for incompatible models
* ggml : ggml_soft_max support F16/F32 mask/pos
ggml-ci
* cuda : uint -> uint32_t
* cuda : "constexpr dim3" -> "const dim3"
ggml-ci
* cuda : try to fix __hgt2_mask
ggml-ci
* ggml : add TODO's for F16/F32 mask/pos support in other backends
* llama : replace bool need_kq_pos with use_alibi
* llama : prep ALiBi support for BERT models
ggml-ci
* llama : fix n_batch requirements
ggml-ci
* cont
* server : add help for --flash-attn arg
* llama : disable FA for AMD
* tests : remove TMP_ATTN_BENCH
ggml-ci
* llama : support save/load state with FA enabled
ggml-ci
* ci : add CUDA save-load-state tests
ggml-ci
* llama : llama_kv_cache_clear zeroes data + fix save-load seq
ggml-ci
* llama : fix copy-paste errors, add TODO
* llama : disallow incompatible states
* llama : update llama_state_get_size after v_trans field
* metal : remove tmp log
* llama : add static reminder for llama_state_get_size
* metal : fix max nsg
ggml-ci
* ci : fix arg order
ggml-ci
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-04-30 12:16:08 +03:00
2025-01-03 10:18:53 +02:00
// Returns the device limit, which is a fixed constant (16).
size_t llama_max_devices(void) {
    const size_t max_devices = 16;
    return max_devices;
}
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
bool llama_supports_mmap ( void ) {
return llama_mmap : : SUPPORTED ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
bool llama_supports_mlock ( void ) {
return llama_mlock : : SUPPORTED ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// True when a GPU-type backend device is registered or, failing that, when
// llama_supports_rpc() reports RPC support.
bool llama_supports_gpu_offload(void) {
    if (ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr) {
        return true;
    }
    return llama_supports_rpc();
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// True when a backend registry entry can be found under the RPC name.
// NOTE(review): the name literal is reproduced exactly as it appears in this
// copy of the file, padding spaces included - confirm against the canonical source.
bool llama_supports_rpc(void) {
    auto * rpc_reg = ggml_backend_reg_by_name(" RPC ");
    return rpc_reg != nullptr;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
void llama_backend_init ( void ) {
ggml_time_init ( ) ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0 , NULL , false } ;
struct ggml_context * ctx = ggml_init ( params ) ;
ggml_free ( ctx ) ;
}
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
void llama_numa_init ( enum ggml_numa_strategy numa ) {
if ( numa ! = GGML_NUMA_STRATEGY_DISABLED ) {
auto * dev = ggml_backend_dev_by_type ( GGML_BACKEND_DEVICE_TYPE_CPU ) ;
GGML_ASSERT ( dev & & " CPU backend is not loaded " ) ;
auto * reg = ggml_backend_dev_backend_reg ( dev ) ;
auto * numa_init_fn = ( decltype ( ggml_numa_init ) * ) ggml_backend_reg_get_proc_address ( reg , " ggml_backend_cpu_numa_init " ) ;
numa_init_fn ( numa ) ;
}
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// Counterpart of llama_backend_init(): calls ggml_quantize_free().
void llama_backend_free(void) {
    ggml_quantize_free();
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// Returns the value of ggml_time_us().
int64_t llama_time_us(void) {
    const int64_t now_us = ggml_time_us();
    return now_us;
}
2024-07-28 00:42:05 -04:00
2025-01-16 13:54:08 +01:00
static struct llama_model * llama_model_load_from_file_impl (
const std : : string & path_model ,
std : : vector < std : : string > & splits ,
2025-01-06 10:55:18 +02:00
struct llama_model_params params ) {
2025-01-03 10:18:53 +02:00
ggml_time_init ( ) ;
2024-07-28 00:42:05 -04:00
2025-01-12 11:32:42 +02:00
llama_model * model = new llama_model ( params ) ;
2024-08-21 17:58:11 -04:00
2025-01-03 10:18:53 +02:00
unsigned cur_percentage = 0 ;
if ( params . progress_callback = = NULL ) {
params . progress_callback_user_data = & cur_percentage ;
params . progress_callback = [ ] ( float progress , void * ctx ) {
unsigned * cur_percentage_p = ( unsigned * ) ctx ;
unsigned percentage = ( unsigned ) ( 100 * progress ) ;
while ( percentage > * cur_percentage_p ) {
* cur_percentage_p = percentage ;
LLAMA_LOG_CONT ( " . " ) ;
if ( percentage > = 100 ) {
LLAMA_LOG_CONT ( " \n " ) ;
2024-07-28 00:42:05 -04:00
}
}
2025-01-03 10:18:53 +02:00
return true ;
} ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// create list of devices to use with this model
if ( params . devices ) {
for ( ggml_backend_dev_t * dev = params . devices ; * dev ; + + dev ) {
model - > devices . push_back ( * dev ) ;
}
} else {
2025-01-26 23:20:34 +08:00
std : : vector < ggml_backend_dev_t > rpc_servers ;
2025-01-03 10:18:53 +02:00
// use all available devices
for ( size_t i = 0 ; i < ggml_backend_dev_count ( ) ; + + i ) {
ggml_backend_dev_t dev = ggml_backend_dev_get ( i ) ;
switch ( ggml_backend_dev_type ( dev ) ) {
case GGML_BACKEND_DEVICE_TYPE_CPU :
case GGML_BACKEND_DEVICE_TYPE_ACCEL :
// skip CPU backends since they are handled separately
break ;
2023-10-03 21:04:01 +03:00
2025-01-03 10:18:53 +02:00
case GGML_BACKEND_DEVICE_TYPE_GPU :
2025-01-26 23:20:34 +08:00
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg ( dev ) ;
if ( ggml_backend_reg_name ( reg ) = = std : : string ( " RPC " ) ) {
rpc_servers . push_back ( dev ) ;
} else {
model - > devices . push_back ( dev ) ;
}
2025-01-03 10:18:53 +02:00
break ;
2024-07-28 00:42:05 -04:00
}
}
2025-01-26 23:20:34 +08:00
// add RPC servers at the front of the list
if ( ! rpc_servers . empty ( ) ) {
model - > devices . insert ( model - > devices . begin ( ) , rpc_servers . begin ( ) , rpc_servers . end ( ) ) ;
}
2024-07-28 00:42:05 -04:00
}
2023-10-03 21:04:01 +03:00
2025-01-03 10:18:53 +02:00
// if using single GPU mode, remove all except the main GPU
if ( params . split_mode = = LLAMA_SPLIT_MODE_NONE ) {
if ( params . main_gpu < 0 | | params . main_gpu > = ( int ) model - > devices . size ( ) ) {
LLAMA_LOG_ERROR ( " %s: invalid value for main_gpu: %d (available devices: %d) \n " , __func__ , params . main_gpu , ( int ) model - > devices . size ( ) ) ;
2025-01-06 10:55:18 +02:00
llama_model_free ( model ) ;
2025-01-03 10:18:53 +02:00
return nullptr ;
}
ggml_backend_dev_t main_gpu = model - > devices [ params . main_gpu ] ;
model - > devices . clear ( ) ;
model - > devices . push_back ( main_gpu ) ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
for ( auto * dev : model - > devices ) {
size_t free , total ; // NOLINT
ggml_backend_dev_memory ( dev , & free , & total ) ;
LLAMA_LOG_INFO ( " %s: using device %s (%s) - %zu MiB free \n " , __func__ , ggml_backend_dev_name ( dev ) , ggml_backend_dev_description ( dev ) , free / 1024 / 1024 ) ;
2024-08-08 23:54:00 -04:00
}
2025-01-16 13:54:08 +01:00
const int status = llama_model_load ( path_model , splits , * model , params ) ;
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( status < = 0 ) ;
if ( status < 0 ) {
if ( status = = - 1 ) {
LLAMA_LOG_ERROR ( " %s: failed to load model \n " , __func__ ) ;
} else if ( status = = - 2 ) {
LLAMA_LOG_INFO ( " %s: cancelled model load \n " , __func__ ) ;
}
2025-01-06 10:55:18 +02:00
llama_model_free ( model ) ;
2025-01-03 10:18:53 +02:00
return nullptr ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
return model ;
}
2024-07-28 00:42:05 -04:00
2025-01-16 13:54:08 +01:00
// deprecated: forwards unchanged to llama_model_load_from_file()
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    struct llama_model * model = llama_model_load_from_file(path_model, params);
    return model;
}
// Loads a model from a single path: calls the shared implementation with an
// empty list of splits.
struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> no_splits;
    return llama_model_load_from_file_impl(path_model, no_splits, params);
}
// Loads a model from an explicit list of split files.
//
// paths   : array of n_paths path strings
// n_paths : number of entries in paths; zero is rejected with an error log
//
// The first path is passed to the shared implementation as the model path and
// the full list as its splits. Returns nullptr when the list is empty,
// otherwise whatever the shared implementation returns.
struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    if (n_paths == 0) {
        LLAMA_LOG_ERROR(" %s: list of splits is empty \n ", __func__);
        return nullptr;
    }

    // one std::string per entry of paths, in order
    std::vector<std::string> splits(paths, paths + n_paths);

    return llama_model_load_from_file_impl(splits.front(), splits, params);
}
2025-01-12 11:32:42 +02:00
struct llama_context * llama_init_from_model (
2025-01-03 10:18:53 +02:00
struct llama_model * model ,
struct llama_context_params params ) {
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
if ( ! model ) {
LLAMA_LOG_ERROR ( " %s: model cannot be NULL \n " , __func__ ) ;
return nullptr ;
2023-04-24 07:40:02 +03:00
}
2025-01-03 10:18:53 +02:00
if ( params . n_batch = = 0 & & params . n_ubatch = = 0 ) {
LLAMA_LOG_ERROR ( " %s: n_batch and n_ubatch cannot both be zero \n " , __func__ ) ;
return nullptr ;
2024-08-08 23:54:00 -04:00
}
2025-01-03 10:18:53 +02:00
if ( params . n_ctx = = 0 & & model - > hparams . n_ctx_train = = 0 ) {
LLAMA_LOG_ERROR ( " %s: n_ctx and model->hparams.n_ctx_train cannot both be zero \n " , __func__ ) ;
return nullptr ;
2024-07-28 00:42:05 -04:00
}
2023-04-24 07:40:02 +03:00
2025-01-03 10:18:53 +02:00
if ( params . flash_attn & & model - > arch = = LLM_ARCH_GROK ) {
LLAMA_LOG_WARN ( " %s: flash_attn is not compatible with Grok - forcing off \n " , __func__ ) ;
params . flash_attn = false ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
if ( params . flash_attn & & model - > hparams . n_embd_head_k ! = model - > hparams . n_embd_head_v ) {
LLAMA_LOG_WARN ( " %s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off \n " , __func__ ) ;
params . flash_attn = false ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
if ( ggml_is_quantized ( params . type_v ) & & ! params . flash_attn ) {
LLAMA_LOG_ERROR ( " %s: V cache quantization requires flash_attn \n " , __func__ ) ;
return nullptr ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
llama_context * ctx = new llama_context ( * model ) ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
const auto & hparams = model - > hparams ;
auto & cparams = ctx - > cparams ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
cparams . n_seq_max = std : : max ( 1u , params . n_seq_max ) ;
cparams . n_threads = params . n_threads ;
cparams . n_threads_batch = params . n_threads_batch ;
cparams . yarn_ext_factor = params . yarn_ext_factor ;
cparams . yarn_attn_factor = params . yarn_attn_factor ;
cparams . yarn_beta_fast = params . yarn_beta_fast ;
cparams . yarn_beta_slow = params . yarn_beta_slow ;
cparams . defrag_thold = params . defrag_thold ;
cparams . embeddings = params . embeddings ;
cparams . offload_kqv = params . offload_kqv ;
cparams . flash_attn = params . flash_attn ;
cparams . no_perf = params . no_perf ;
cparams . pooling_type = params . pooling_type ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
cparams . n_ctx = params . n_ctx = = 0 ? hparams . n_ctx_train : params . n_ctx ;
cparams . rope_freq_base = params . rope_freq_base = = 0.0f ? hparams . rope_freq_base_train : params . rope_freq_base ;
cparams . rope_freq_scale = params . rope_freq_scale = = 0.0f ? hparams . rope_freq_scale_train : params . rope_freq_scale ;
// this is necessary due to kv_self.n being padded later during inference
cparams . n_ctx = GGML_PAD ( cparams . n_ctx , llama_kv_cache_get_padding ( cparams ) ) ;
// with causal attention, the batch size is limited by the context size
cparams . n_batch = hparams . causal_attn ? std : : min ( cparams . n_ctx , params . n_batch ) : params . n_batch ;
2024-08-08 23:54:00 -04:00
2025-01-03 10:18:53 +02:00
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
if ( cparams . n_batch < GGML_KQ_MASK_PAD ) {
LLAMA_LOG_WARN ( " %s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d \n " , __func__ , GGML_KQ_MASK_PAD ) ;
cparams . n_batch = GGML_KQ_MASK_PAD ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
cparams . n_ubatch = std : : min ( cparams . n_batch , params . n_ubatch = = 0 ? params . n_batch : params . n_ubatch ) ;
cparams . n_ctx_orig_yarn = params . yarn_orig_ctx ! = 0 ? params . yarn_orig_ctx :
hparams . n_ctx_orig_yarn ! = 0 ? hparams . n_ctx_orig_yarn :
hparams . n_ctx_train ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
cparams . cb_eval = params . cb_eval ;
cparams . cb_eval_user_data = params . cb_eval_user_data ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
auto rope_scaling_type = params . rope_scaling_type ;
if ( rope_scaling_type = = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) {
rope_scaling_type = hparams . rope_scaling_type_train ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
if ( rope_scaling_type = = LLAMA_ROPE_SCALING_TYPE_NONE ) {
cparams . rope_freq_scale = 1.0f ; // never scale if scaling type is none
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
if ( cparams . yarn_ext_factor < 0.0f ) { // negative indicates 'not set'
cparams . yarn_ext_factor = rope_scaling_type = = LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
cparams . yarn_attn_factor * = hparams . rope_attn_factor ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
if ( cparams . pooling_type = = LLAMA_POOLING_TYPE_UNSPECIFIED ) {
if ( hparams . pooling_type = = LLAMA_POOLING_TYPE_UNSPECIFIED ) {
cparams . pooling_type = LLAMA_POOLING_TYPE_NONE ;
} else {
cparams . pooling_type = hparams . pooling_type ;
}
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
if ( params . attention_type = = LLAMA_ATTENTION_TYPE_UNSPECIFIED ) {
cparams . causal_attn = hparams . causal_attn ;
} else {
cparams . causal_attn = params . attention_type = = LLAMA_ATTENTION_TYPE_CAUSAL ;
2024-07-28 00:42:05 -04:00
}
2025-01-03 10:18:53 +02:00
const uint32_t n_ctx_per_seq = cparams . n_ctx / cparams . n_seq_max ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
LLAMA_LOG_INFO ( " %s: n_seq_max = %u \n " , __func__ , cparams . n_seq_max ) ;
LLAMA_LOG_INFO ( " %s: n_ctx = %u \n " , __func__ , cparams . n_ctx ) ;
LLAMA_LOG_INFO ( " %s: n_ctx_per_seq = %u \n " , __func__ , n_ctx_per_seq ) ;
LLAMA_LOG_INFO ( " %s: n_batch = %u \n " , __func__ , cparams . n_batch ) ;
LLAMA_LOG_INFO ( " %s: n_ubatch = %u \n " , __func__ , cparams . n_ubatch ) ;
LLAMA_LOG_INFO ( " %s: flash_attn = %d \n " , __func__ , cparams . flash_attn ) ;
LLAMA_LOG_INFO ( " %s: freq_base = %.1f \n " , __func__ , cparams . rope_freq_base ) ;
LLAMA_LOG_INFO ( " %s: freq_scale = %g \n " , __func__ , cparams . rope_freq_scale ) ;
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
if ( n_ctx_per_seq < hparams . n_ctx_train ) {
LLAMA_LOG_WARN ( " %s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized \n " ,
__func__ , n_ctx_per_seq , hparams . n_ctx_train ) ;
2024-07-28 00:42:05 -04:00
}
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
if ( n_ctx_per_seq > hparams . n_ctx_train ) {
LLAMA_LOG_WARN ( " %s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow \n " ,
__func__ , n_ctx_per_seq , hparams . n_ctx_train ) ;
2023-05-01 14:54:59 +03:00
}
2025-01-03 10:18:53 +02:00
ctx - > logits_all = params . logits_all ;
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
// build worst-case graph for encoder if a model contains encoder
ctx - > is_encoding = llama_model_has_encoder ( model ) ;
uint32_t kv_size = cparams . n_ctx ;
ggml_type type_k = params . type_k ;
ggml_type type_v = params . type_v ;
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
// Mamba only needs a constant number of KV cache cells per sequence
if ( llama_model_is_recurrent ( model ) ) {
// Mamba needs at least as many KV cells as there are sequences kept at any time
kv_size = std : : max ( ( uint32_t ) 1 , params . n_seq_max ) ;
// it's probably best to keep as much precision as possible for the states
type_k = GGML_TYPE_F32 ; // required by ggml_ssm_conv for Mamba's conv_states
type_v = GGML_TYPE_F32 ; // required by ggml_ssm_scan for Mamba's ssm_states
2023-05-01 14:54:59 +03:00
}
2025-01-03 10:18:53 +02:00
GGML_ASSERT ( hparams . n_embd_head_k % ggml_blck_size ( type_k ) = = 0 ) ;
GGML_ASSERT ( hparams . n_embd_head_v % ggml_blck_size ( type_v ) = = 0 ) ;
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
if ( ! hparams . vocab_only ) {
// GPU backends
for ( auto * dev : model - > devices ) {
ggml_backend_t backend = ggml_backend_dev_init ( dev , nullptr ) ;
if ( backend = = nullptr ) {
LLAMA_LOG_ERROR ( " %s: failed to initialize %s backend \n " , __func__ , ggml_backend_dev_name ( dev ) ) ;
llama_free ( ctx ) ;
return nullptr ;
}
ctx - > backends . emplace_back ( backend ) ;
}
2024-07-28 00:42:05 -04:00
2025-01-03 10:18:53 +02:00
// add ACCEL backends (such as BLAS)
for ( size_t i = 0 ; i < ggml_backend_dev_count ( ) ; + + i ) {
ggml_backend_dev_t dev = ggml_backend_dev_get ( i ) ;
if ( ggml_backend_dev_type ( dev ) = = GGML_BACKEND_DEVICE_TYPE_ACCEL ) {
ggml_backend_t backend = ggml_backend_dev_init ( dev , nullptr ) ;
if ( backend = = nullptr ) {
LLAMA_LOG_ERROR ( " %s: failed to initialize %s backend \n " , __func__ , ggml_backend_dev_name ( dev ) ) ;
llama_free ( ctx ) ;
return nullptr ;
}
ctx - > backends . emplace_back ( backend ) ;
}
2023-05-01 14:54:59 +03:00
}
2025-01-03 10:18:53 +02:00
// add CPU backend
ctx - > backend_cpu = ggml_backend_init_by_type ( GGML_BACKEND_DEVICE_TYPE_CPU , nullptr ) ;
if ( ctx - > backend_cpu = = nullptr ) {
LLAMA_LOG_ERROR ( " %s: failed to initialize CPU backend \n " , __func__ ) ;
llama_free ( ctx ) ;
return nullptr ;
}
ctx - > backends . emplace_back ( ctx - > backend_cpu ) ;
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
// create a list of the set_n_threads functions in the backends
for ( auto & backend : ctx - > backends ) {
ggml_backend_dev_t dev = ggml_backend_get_device ( backend . get ( ) ) ;
ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg ( dev ) : nullptr ;
if ( reg ) {
auto ggml_backend_set_n_threads_fn = ( ggml_backend_set_n_threads_t ) ggml_backend_reg_get_proc_address ( reg , " ggml_backend_set_n_threads " ) ;
if ( ggml_backend_set_n_threads_fn ) {
ctx - > set_n_threads_fns . emplace_back ( backend . get ( ) , ggml_backend_set_n_threads_fn ) ;
}
}
}
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
llama_set_abort_callback ( ctx , params . abort_callback , params . abort_callback_data ) ;
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
if ( ! llama_kv_cache_init ( ctx - > kv_self , ctx - > model , ctx - > cparams , type_k , type_v , kv_size , cparams . offload_kqv ) ) {
LLAMA_LOG_ERROR ( " %s: llama_kv_cache_init() failed for self-attention cache \n " , __func__ ) ;
llama_free ( ctx ) ;
return nullptr ;
}
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
{
size_t memory_size_k = 0 ;
size_t memory_size_v = 0 ;
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
for ( auto & k : ctx - > kv_self . k_l ) {
memory_size_k + = ggml_nbytes ( k ) ;
}
2023-05-01 14:54:59 +03:00
2025-01-03 10:18:53 +02:00
for ( auto & v : ctx - > kv_self . v_l ) {
memory_size_v + = ggml_nbytes ( v ) ;
}
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
LLAMA_LOG_INFO ( " %s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB \n " , __func__ ,
( float ) ( memory_size_k + memory_size_v ) / ( 1024.0f * 1024.0f ) ,
ggml_type_name ( type_k ) , ( float ) memory_size_k / ( 1024.0f * 1024.0f ) ,
ggml_type_name ( type_v ) , ( float ) memory_size_v / ( 1024.0f * 1024.0f ) ) ;
}
2024-04-25 17:59:03 +02:00
2025-01-03 10:18:53 +02:00
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
if ( llama_output_reserve ( * ctx , params . n_seq_max ) < params . n_seq_max ) {
LLAMA_LOG_ERROR ( " %s: failed to reserve initial output buffer \n " , __func__ ) ;
llama_free ( ctx ) ;
return nullptr ;
}
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
LLAMA_LOG_INFO ( " %s: %10s output buffer size = %8.2f MiB \n " , __func__ ,
ggml_backend_buffer_name ( ctx - > buf_output . get ( ) ) ,
ggml_backend_buffer_get_size ( ctx - > buf_output . get ( ) ) / 1024.0 / 1024.0 ) ;
}
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// scheduler and compute buffers
{
// buffer types used for the compute buffer of each backend
std : : vector < ggml_backend_buffer_type_t > backend_buft ;
std : : vector < ggml_backend_t > backend_ptrs ;
for ( auto & backend : ctx - > backends ) {
auto * buft = ggml_backend_get_default_buffer_type ( backend . get ( ) ) ;
auto backend_type = ggml_backend_dev_type ( ggml_backend_get_device ( backend . get ( ) ) ) ;
if ( backend_type = = GGML_BACKEND_DEVICE_TYPE_CPU & & ! model - > devices . empty ( ) ) {
// use the host buffer of the first device CPU for faster transfer of the intermediate state
auto * dev = model - > devices [ 0 ] ;
auto * host_buft = ggml_backend_dev_host_buffer_type ( dev ) ;
if ( host_buft ) {
buft = host_buft ;
}
}
backend_buft . push_back ( buft ) ;
backend_ptrs . push_back ( backend . get ( ) ) ;
}
2024-04-08 20:43:30 +08:00
2025-01-12 11:32:42 +02:00
const size_t max_nodes = model - > max_nodes ( ) ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// buffer used to store the computation graph and the tensor meta data
ctx - > buf_compute_meta . resize ( ggml_tensor_overhead ( ) * max_nodes + ggml_graph_overhead_custom ( max_nodes , false ) ) ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
2025-01-12 11:32:42 +02:00
model - > n_devices ( ) > 1 & &
model - > params . n_gpu_layers > ( int ) model - > hparams . n_layer & &
model - > params . split_mode = = LLAMA_SPLIT_MODE_LAYER & &
2025-01-03 10:18:53 +02:00
params . offload_kqv ;
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
// pipeline parallelism requires support for async compute and events in all devices
if ( pipeline_parallel ) {
for ( auto & backend : ctx - > backends ) {
auto dev_type = ggml_backend_dev_type ( ggml_backend_get_device ( backend . get ( ) ) ) ;
if ( dev_type = = GGML_BACKEND_DEVICE_TYPE_CPU ) {
// ignore CPU backend
continue ;
}
auto * dev = ggml_backend_get_device ( backend . get ( ) ) ;
ggml_backend_dev_props props ;
ggml_backend_dev_get_props ( dev , & props ) ;
if ( ! props . caps . async | | ! props . caps . events ) {
// device does not support async compute or events
pipeline_parallel = false ;
break ;
}
}
}
2024-07-05 05:14:21 +12:00
2025-01-03 10:18:53 +02:00
ctx - > sched . reset ( ggml_backend_sched_new ( backend_ptrs . data ( ) , backend_buft . data ( ) , backend_ptrs . size ( ) , max_nodes , pipeline_parallel ) ) ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
if ( pipeline_parallel ) {
LLAMA_LOG_INFO ( " %s: pipeline parallelism enabled (n_copies=%d) \n " , __func__ , ggml_backend_sched_get_n_copies ( ctx - > sched . get ( ) ) ) ;
}
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// initialize scheduler with the worst-case graph
uint32_t n_seqs = 1 ; // TODO: worst-case number of sequences
uint32_t n_tokens = std : : min ( cparams . n_ctx , cparams . n_ubatch ) ;
2025-01-12 11:32:42 +02:00
llama_token token = ctx - > model . vocab . token_bos ( ) ; // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
llama_ubatch ubatch_pp = { true , n_tokens , n_tokens / n_seqs , n_seqs , & token , nullptr , nullptr , nullptr , nullptr , nullptr } ;
ggml_cgraph * gf_pp = llama_build_graph ( * ctx , ubatch_pp , true ) ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// reserve pp graph first so that buffers are only allocated once
ggml_backend_sched_reserve ( ctx - > sched . get ( ) , gf_pp ) ;
int n_splits_pp = ggml_backend_sched_get_n_splits ( ctx - > sched . get ( ) ) ;
int n_nodes_pp = ggml_graph_n_nodes ( gf_pp ) ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// reserve with tg graph to get the number of splits and nodes
llama_ubatch ubatch_tg = { true , 1 , 1 , n_seqs , & token , nullptr , nullptr , nullptr , nullptr , nullptr } ;
ggml_cgraph * gf_tg = llama_build_graph ( * ctx , ubatch_tg , true ) ;
ggml_backend_sched_reserve ( ctx - > sched . get ( ) , gf_tg ) ;
int n_splits_tg = ggml_backend_sched_get_n_splits ( ctx - > sched . get ( ) ) ;
int n_nodes_tg = ggml_graph_n_nodes ( gf_tg ) ;
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
gf_pp = llama_build_graph ( * ctx , ubatch_pp , true ) ;
if ( ! ggml_backend_sched_reserve ( ctx - > sched . get ( ) , gf_pp ) ) {
LLAMA_LOG_ERROR ( " %s: failed to allocate compute buffers \n " , __func__ ) ;
llama_free ( ctx ) ;
return nullptr ;
}
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
for ( size_t i = 0 ; i < backend_ptrs . size ( ) ; + + i ) {
ggml_backend_t backend = backend_ptrs [ i ] ;
ggml_backend_buffer_type_t buft = backend_buft [ i ] ;
size_t size = ggml_backend_sched_get_buffer_size ( ctx - > sched . get ( ) , backend ) ;
if ( size > 1 ) {
LLAMA_LOG_INFO ( " %s: %10s compute buffer size = %8.2f MiB \n " , __func__ ,
ggml_backend_buft_name ( buft ) ,
size / 1024.0 / 1024.0 ) ;
}
}
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
if ( n_nodes_pp = = n_nodes_tg ) {
LLAMA_LOG_INFO ( " %s: graph nodes = %d \n " , __func__ , n_nodes_pp ) ;
} else {
LLAMA_LOG_INFO ( " %s: graph nodes = %d (with bs=%d), %d (with bs=1) \n " , __func__ , n_nodes_pp , n_tokens , n_nodes_tg ) ;
}
if ( n_splits_pp = = n_splits_tg ) {
LLAMA_LOG_INFO ( " %s: graph splits = %d \n " , __func__ , n_splits_pp ) ;
} else {
LLAMA_LOG_INFO ( " %s: graph splits = %d (with bs=%d), %d (with bs=1) \n " , __func__ , n_splits_pp , n_tokens , n_splits_tg ) ;
}
2024-04-08 20:43:30 +08:00
}
}
2025-01-03 10:18:53 +02:00
return ctx ;
}
2024-04-08 20:43:30 +08:00
2025-01-12 11:32:42 +02:00
// Forwards unchanged to llama_init_from_model().
struct llama_context * llama_new_context_with_model(
        struct llama_model * model,
        struct llama_context_params params) {
    struct llama_context * ctx = llama_init_from_model(model, params);
    return ctx;
}
2025-01-03 10:18:53 +02:00
//
// kv cache
//
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
// TODO: tmp bridges below until `struct llama_kv_cache` is exposed through the public API
2024-04-08 20:43:30 +08:00
2025-01-03 10:18:53 +02:00
struct llama_kv_cache_view llama_kv_cache_view_init ( const struct llama_context * ctx , int32_t n_seq_max ) {
return llama_kv_cache_view_init ( ctx - > kv_self , n_seq_max ) ;
2024-04-08 20:43:30 +08:00
}
2025-01-03 10:18:53 +02:00
void llama_kv_cache_view_update ( const struct llama_context * ctx , struct llama_kv_cache_view * view ) {
llama_kv_cache_view_update ( view , ctx - > kv_self ) ;
2024-04-08 20:43:30 +08:00
}
2025-01-03 10:18:53 +02:00
int32_t llama_get_kv_cache_token_count ( const struct llama_context * ctx ) {
return llama_get_kv_cache_token_count ( ctx - > kv_self ) ;
2024-04-08 20:43:30 +08:00
}
2025-01-03 10:18:53 +02:00
int32_t llama_get_kv_cache_used_cells ( const struct llama_context * ctx ) {
return llama_get_kv_cache_used_cells ( ctx - > kv_self ) ;
2023-09-28 21:42:38 +02:00
}
2025-01-03 10:18:53 +02:00
void llama_kv_cache_clear ( struct llama_context * ctx ) {
llama_kv_cache_clear ( ctx - > kv_self ) ;
2024-05-23 14:29:26 +02:00
}
2025-01-03 10:18:53 +02:00
bool llama_kv_cache_seq_rm ( struct llama_context * ctx , llama_seq_id seq_id , llama_pos p0 , llama_pos p1 ) {
return llama_kv_cache_seq_rm ( ctx - > kv_self , seq_id , p0 , p1 ) ;
2024-05-23 14:29:26 +02:00
}
2025-01-03 10:18:53 +02:00
void llama_kv_cache_seq_cp ( struct llama_context * ctx , llama_seq_id seq_id_src , llama_seq_id seq_id_dst , llama_pos p0 , llama_pos p1 ) {
if ( seq_id_src = = seq_id_dst ) {
return ;
2024-11-25 15:13:39 +01:00
}
2025-01-03 10:18:53 +02:00
llama_kv_cache_seq_cp ( ctx - > kv_self , seq_id_src , seq_id_dst , p0 , p1 ) ;
2024-03-02 20:52:25 +01:00
}
2025-01-03 10:18:53 +02:00
void llama_kv_cache_seq_keep ( struct llama_context * ctx , llama_seq_id seq_id ) {
llama_kv_cache_seq_keep ( ctx - > kv_self , seq_id ) ;
2024-06-21 00:38:22 -05:00
}
2025-01-03 10:18:53 +02:00
void llama_kv_cache_seq_add ( struct llama_context * ctx , llama_seq_id seq_id , llama_pos p0 , llama_pos p1 , llama_pos delta ) {
if ( delta = = 0 ) {
return ;
}
2024-03-10 11:56:30 -04:00
2025-01-03 10:18:53 +02:00
llama_kv_cache_seq_add ( ctx - > kv_self , seq_id , p0 , p1 , delta ) ;
2023-09-28 19:04:36 +03:00
}
2023-06-04 23:34:30 +03:00
2025-01-03 10:18:53 +02:00
void llama_kv_cache_seq_div ( struct llama_context * ctx , llama_seq_id seq_id , llama_pos p0 , llama_pos p1 , int d ) {
if ( d = = 1 ) {
return ;
2023-06-04 23:34:30 +03:00
}
2025-01-03 10:18:53 +02:00
llama_kv_cache_seq_div ( ctx - > kv_self , seq_id , p0 , p1 , d ) ;
}
// Bridge: maximum position for seq_id as reported by the context's kv_self cache.
llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
    auto & kv = ctx->kv_self;
    return llama_kv_cache_seq_pos_max(kv, seq_id);
}
2024-02-01 23:20:13 -08:00
2025-01-03 10:18:53 +02:00
void llama_kv_cache_defrag ( struct llama_context * ctx ) {
llama_kv_cache_defrag ( ctx - > kv_self ) ;
}
2023-09-28 19:04:36 +03:00
2025-01-03 10:18:53 +02:00
void llama_kv_cache_update ( struct llama_context * ctx ) {
2025-01-06 10:52:01 +02:00
llama_kv_cache_update_impl ( * ctx ) ;
2023-09-28 19:04:36 +03:00
}
2025-01-03 10:18:53 +02:00
bool llama_kv_cache_can_shift ( struct llama_context * ctx ) {
return llama_kv_cache_can_shift ( ctx - > kv_self ) ;
2023-09-28 19:04:36 +03:00
}
2025-01-03 10:18:53 +02:00
///
2024-07-04 15:46:11 +02:00
int32_t llama_encode (
struct llama_context * ctx ,
struct llama_batch batch ) {
2025-01-06 10:52:01 +02:00
const int ret = llama_encode_impl ( * ctx , batch ) ;
2024-10-18 23:18:01 +02:00
if ( ret ! = 0 ) {
2024-07-04 15:46:11 +02:00
LLAMA_LOG_ERROR ( " %s: failed to encode, ret = %d \n " , __func__ , ret ) ;
}
return ret ;
}
2024-01-02 06:15:16 -08:00
int32_t llama_decode (
2023-09-28 19:04:36 +03:00
struct llama_context * ctx ,
2023-09-28 21:42:38 +02:00
struct llama_batch batch ) {
2025-01-06 10:52:01 +02:00
const int ret = llama_decode_impl ( * ctx , batch ) ;
2024-10-18 23:18:01 +02:00
if ( ret ! = 0 ) {
2023-09-28 19:04:36 +03:00
LLAMA_LOG_ERROR ( " %s: failed to decode, ret = %d \n " , __func__ , ret ) ;
}
return ret ;
2023-06-04 23:34:30 +03:00
}
2024-07-23 13:10:17 +03:00
//
// chat templates
//
2024-02-19 09:23:37 +01:00
2024-07-23 13:10:17 +03:00
int32_t llama_chat_apply_template (
2024-02-19 09:23:37 +01:00
const char * tmpl ,
const struct llama_chat_message * chat ,
size_t n_msg ,
bool add_ass ,
char * buf ,
int32_t length ) {
2025-01-12 11:32:42 +02:00
const std : : string curr_tmpl ( tmpl = = nullptr ? " chatml " : tmpl ) ;
2024-03-07 11:41:53 +02:00
2024-02-19 09:23:37 +01:00
// format the chat to string
std : : vector < const llama_chat_message * > chat_vec ;
chat_vec . resize ( n_msg ) ;
for ( size_t i = 0 ; i < n_msg ; i + + ) {
chat_vec [ i ] = & chat [ i ] ;
}
2024-03-07 11:41:53 +02:00
2024-02-19 09:23:37 +01:00
std : : string formatted_chat ;
2025-01-03 10:18:53 +02:00
llm_chat_template detected_tmpl = llm_chat_detect_template ( curr_tmpl ) ;
2024-12-02 22:10:19 +01:00
if ( detected_tmpl = = LLM_CHAT_TEMPLATE_UNKNOWN ) {
return - 1 ;
}
2025-01-03 10:18:53 +02:00
int32_t res = llm_chat_apply_template ( detected_tmpl , chat_vec , formatted_chat , add_ass ) ;
2024-02-19 09:23:37 +01:00
if ( res < 0 ) {
return res ;
}
2024-03-07 11:41:53 +02:00
if ( buf & & length > 0 ) {
strncpy ( buf , formatted_chat . c_str ( ) , length ) ;
}
2024-02-19 09:23:37 +01:00
return res ;
}
2024-07-23 13:10:17 +03:00
//
2024-09-07 15:16:19 +03:00
// model split
2024-07-23 13:10:17 +03:00
//
// Builds the file name of one part of a split model:
//   "<path_prefix>-<split_no + 1, 5 digits>-of-<split_count, 5 digits>.gguf"
//
// split_path  : output buffer
// maxlen      : size of split_path in bytes
// path_prefix : common prefix of all parts
// split_no    : zero-based index of the part
// split_count : total number of parts
//
// Returns the length of the string stored in split_path (the truncated length
// when maxlen is too small), or 0 when nothing could be written (maxlen is 0
// or snprintf() reported an error).
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    // with a zero-sized buffer snprintf() stores nothing, so there is no string to measure
    if (maxlen == 0) {
        return 0;
    }

    // snprintf() returns a negative value on error; that must not be treated as success
    const int n_written = snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count);
    if (n_written < 0) {
        split_path[0] = '\0';
        return 0;
    }

    return (int) strlen(split_path);
}

// Extracts the prefix of a split file name, i.e. the inverse of llama_split_path().
//
// split_prefix : output buffer
// maxlen       : size of split_prefix in bytes
// split_path   : file name that is expected to end with the postfix for
//                (split_no, split_count)
//
// Returns the length of the prefix when split_path ends with the expected
// postfix and the prefix is non-empty, otherwise 0 (split_prefix is left
// untouched in that case). The returned length is that of the full prefix
// even when maxlen forces the copy in split_prefix to be truncated.
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    const std::string str_split_path(split_path);

    char postfix[32];
    snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    const std::string str_postfix(postfix);

    // a non-empty prefix requires the path to be strictly longer than the postfix;
    // checking first avoids relying on unsigned wrap-around in the subtraction below
    if (str_split_path.size() <= str_postfix.size()) {
        return 0;
    }
    const size_t size_prefix = str_split_path.size() - str_postfix.size();

    // check if split_path ends with postfix
    if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) != 0) {
        return 0;
    }

    // copy the prefix only; the size argument includes room for the terminating NUL
    snprintf(split_prefix, std::min(size_prefix + 1, maxlen), "%s", split_path);

    return (int) size_prefix;
}
2023-03-22 07:32:36 +02:00
const char * llama_print_system_info ( void ) {
static std : : string s ;
2025-01-06 12:21:46 +01:00
s . clear ( ) ; // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
2023-03-22 07:32:36 +02:00
2024-11-25 15:13:39 +01:00
for ( size_t i = 0 ; i < ggml_backend_reg_count ( ) ; i + + ) {
auto * reg = ggml_backend_reg_get ( i ) ;
auto * get_features_fn = ( ggml_backend_get_features_t ) ggml_backend_reg_get_proc_address ( reg , " ggml_backend_get_features " ) ;
if ( get_features_fn ) {
ggml_backend_feature * features = get_features_fn ( reg ) ;
s + = ggml_backend_reg_name ( reg ) ;
s + = " : " ;
for ( ; features - > name ; features + + ) {
s + = features - > name ;
s + = " = " ;
s + = features - > value ;
s + = " | " ;
}
}
}
2023-03-22 07:32:36 +02:00
return s . c_str ( ) ;
}
2023-04-08 00:09:18 +02:00
2025-01-03 10:18:53 +02:00
//
// perf
//
2024-09-13 09:53:38 +03:00
struct llama_perf_context_data llama_perf_context ( const struct llama_context * ctx ) {
struct llama_perf_context_data data = { } ;
2024-09-07 15:16:19 +03:00
2024-09-13 09:53:38 +03:00
if ( ctx = = nullptr ) {
return data ;
}
2024-09-07 15:16:19 +03:00
2024-09-13 09:53:38 +03:00
data . t_start_ms = 1e-3 * ctx - > t_start_us ;
data . t_load_ms = 1e-3 * ctx - > t_load_us ;
data . t_p_eval_ms = 1e-3 * ctx - > t_p_eval_us ;
data . t_eval_ms = 1e-3 * ctx - > t_eval_us ;
data . n_p_eval = std : : max ( 1 , ctx - > n_p_eval ) ;
data . n_eval = std : : max ( 1 , ctx - > n_eval ) ;
2024-09-07 15:16:19 +03:00
2024-09-13 09:53:38 +03:00
return data ;
2024-09-07 15:16:19 +03:00
}
2024-09-13 09:53:38 +03:00
void llama_perf_context_print ( const struct llama_context * ctx ) {
const auto data = llama_perf_context ( ctx ) ;
2024-09-07 15:16:19 +03:00
2024-09-13 09:53:38 +03:00
const double t_end_ms = 1e-3 * ggml_time_us ( ) ;
2024-09-07 15:16:19 +03:00
2024-09-13 09:53:38 +03:00
LLAMA_LOG_INFO ( " %s: load time = %10.2f ms \n " , __func__ , data . t_load_ms ) ;
LLAMA_LOG_INFO ( " %s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second) \n " ,
__func__ , data . t_p_eval_ms , data . n_p_eval , data . t_p_eval_ms / data . n_p_eval , 1e3 / data . t_p_eval_ms * data . n_p_eval ) ;
LLAMA_LOG_INFO ( " %s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second) \n " ,
__func__ , data . t_eval_ms , data . n_eval , data . t_eval_ms / data . n_eval , 1e3 / data . t_eval_ms * data . n_eval ) ;
LLAMA_LOG_INFO ( " %s: total time = %10.2f ms / %5d tokens \n " , __func__ , ( t_end_ms - data . t_start_ms ) , ( data . n_p_eval + data . n_eval ) ) ;
}
void llama_perf_context_reset ( struct llama_context * ctx ) {
ctx - > t_start_us = ggml_time_us ( ) ;
ctx - > t_eval_us = ctx - > n_eval = 0 ;
ctx - > t_p_eval_us = ctx - > n_p_eval = 0 ;
2024-09-07 15:16:19 +03:00
}