llama : reuse compute graphs (#14482)
* llama : reuse compute graphs ggml-ci * llama-bench : add graph reuse parameter ggml-ci * cont : remove the parameter and the sched resets ggml-ci * graph : rename update() to can_reuse() ggml-ci * params : remove is_same() ggml-ci * graph : set res->params in llm_graph_context constructor ggml-ci * graph : avoid set_max_nodes in llm_graph_result ggml-ci * kv-cache : reuse llama_context's graph result instance ggml-ci * context : reset the previous graph result upon memory updates ggml-ci * batch : llama_ubatch now carries its data instead of pointing to balloc ggml-ci * merge : fix build ggml-ci * graph : fix can_reuse() checks when flash-attention is disabled * graph : move llm_graph_result impl in source file + debug env ggml-ci
This commit is contained in:
@@ -8,12 +8,17 @@
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <bitset>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
// keep this struct lightweight
|
||||
// it points to data in `llama_batch_allocr`
|
||||
struct llama_ubatch {
|
||||
bool equal_seqs;
|
||||
bool equal_seqs() const {
|
||||
return b_equal_seqs != 0;
|
||||
}
|
||||
|
||||
uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
|
||||
// otherwise address sanitizer complains
|
||||
// TODO: whole_seqs for embeddings?
|
||||
|
||||
uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
|
||||
@@ -34,6 +39,20 @@ struct llama_ubatch {
|
||||
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
|
||||
int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
|
||||
int8_t * output; // [n_tokens] | i | -
|
||||
|
||||
struct data_t {
|
||||
std::vector<llama_token> token;
|
||||
std::vector<float> embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id *> seq_id;
|
||||
std::vector<llama_seq_id> seq_id_unq;
|
||||
std::vector<int32_t> seq_idx;
|
||||
std::vector<int8_t> output;
|
||||
};
|
||||
|
||||
// the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
|
||||
std::shared_ptr<data_t> data;
|
||||
};
|
||||
|
||||
// a helper for sanitizing, fulfilling and splitting a batch
|
||||
@@ -137,20 +156,5 @@ private:
|
||||
// used[i] indicates if token i has already been used in a previous ubatch
|
||||
std::vector<bool> used;
|
||||
|
||||
// llama_ubatch points to this data:
|
||||
struct ubatch {
|
||||
std::vector<llama_token> token;
|
||||
std::vector<float> embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id *> seq_id;
|
||||
std::vector<llama_seq_id> seq_id_unq;
|
||||
std::vector<int32_t> seq_idx;
|
||||
std::vector<int8_t> output;
|
||||
};
|
||||
|
||||
// current splitting state:
|
||||
std::vector<ubatch> ubatches;
|
||||
|
||||
int debug;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user