kv-cache : separate recurrent vs non-recurrent impl (#12799)
* kv-cache : separate recurrent vs non-recurrent impl (wip) ggml-ci
* kv-cache : init -> constructor + add llama_memory_params ggml-ci
* kv-cache : fix callback reference ggml-ci
* context : llama_kv_cache -> llama_memory_i ggml-ci
* context : move memory creation logic to model ggml-ci
* llama : remove reference of memory during encode ggml-ci
* kv-cache : hide padding details in the implementation ggml-ci
* kv-cache : add ubatch_next() ggml-ci
* context : simplify sbatch logic ggml-ci
* kv-cache : hide defrag logic in the implementation ggml-ci
* context : hide kv cache details in implementation ggml-ci
* build : fix ggml-ci
* cont : another fix ggml-ci
* kv-cache : simplify interface (wip) ggml-ci
* kv-cache : use separate KV cell structs for unified/recurrent ggml-ci
* kv-cache : clean-up ggml-ci
* model : better llama_model::create_model() signature ggml-ci
* kv-cache : fix recurrent seq_rm() ggml-ci
* kv-cache : replace `struct callbacks` with `llama_model &` ggml-ci
* kv-cache : replace `struct graph_params` with `llama_context &` ggml-ci
* kv-cache : fix offload check ggml-ci
* context : avoid passing unique_ptr ggml-ci
* kv-cache : avoid using the backends from the llama_context ref #13113 ggml-ci
* kv-cache : more consistent debug logs [no ci]
* kv-cache : do not pass the full llama_context for kv graphs ggml-ci
* kv-cache : remove comment
* kv-cache : ggml_rope_ext_inplace -> ggml_rope_ext ggml-ci
* kv-cache : fix recurrent multi-user case ggml-ci
* memory : remove comments [no ci]
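For orientation, the sketch below shows the shape this refactor gives the memory layer: the context holds only an abstract memory interface, the model decides which concrete implementation to construct, and the unified and recurrent KV caches become separate classes. All type and member names here are illustrative stand-ins, not the actual definitions from llama-memory.h / llama-kv-cache.h.

// illustrative sketch only -- the real interfaces differ in detail
#include <memory>

struct memory_params_sketch {
    int type_k; // cache type for K (stand-in for ggml_type)
    int type_v; // cache type for V
};

// abstract memory interface: all that llama_context needs to see
struct memory_i_sketch {
    virtual ~memory_i_sketch() = default;
    virtual void clear()               = 0;
    virtual bool get_can_shift() const = 0;
};

// unified self-attention KV cache: one shared pool of cells across sequences
struct kv_cache_unified_sketch : memory_i_sketch {
    void clear() override {}
    bool get_can_shift() const override { return true; }
};

// recurrent state cache (e.g. Mamba): one cell per sequence, no K-shift support
struct kv_cache_recurrent_sketch : memory_i_sketch {
    void clear() override {}
    bool get_can_shift() const override { return false; }
};

// the model picks the implementation (cf. llama_model::create_memory in this commit)
std::unique_ptr<memory_i_sketch> create_memory_sketch(bool is_recurrent, const memory_params_sketch & params) {
    (void) params; // a real factory would forward type_k/type_v, cache size, offload flags, ...
    if (is_recurrent) {
        return std::make_unique<kv_cache_recurrent_sketch>();
    }
    return std::make_unique<kv_cache_unified_sketch>();
}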
This commit is contained in:
parent cb06a3c363, commit c642bc014c
11 changed files with 1960 additions and 1048 deletions
|
@@ -6,11 +6,9 @@
|
|||
#include "llama-model.h"
|
||||
#include "llama-kv-cache.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <stdexcept>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
|
||||
//
|
||||
// llama_context
|
||||
|
@@ -177,44 +175,13 @@ llama_context::llama_context(
|
|||
}
|
||||
|
||||
// init the memory module
|
||||
// TODO: for now, always create a unified KV cache
|
||||
if (!hparams.vocab_only) {
|
||||
kv_self.reset(static_cast<llama_kv_cache_unified *>(model.create_memory()));
|
||||
llama_memory_params params_mem = {
|
||||
/*.type_k =*/ params.type_k,
|
||||
/*.type_v =*/ params.type_v,
|
||||
};
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
||||
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self->get_padding(cparams));
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
||||
|
||||
uint32_t kv_size = cparams.n_ctx;
|
||||
ggml_type type_k = params.type_k;
|
||||
ggml_type type_v = params.type_v;
|
||||
|
||||
if (llama_model_is_recurrent(&model)) {
|
||||
// Mamba needs at least as many KV cells as there are sequences kept at any time
|
||||
kv_size = std::max((uint32_t) 1, params.n_seq_max);
|
||||
// it's probably best to keep as much precision as possible for the states
|
||||
type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
|
||||
type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
|
||||
}
|
||||
|
||||
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
|
||||
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
||||
|
||||
if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
||||
throw std::runtime_error("failed to initialize self-attention cache");
|
||||
}
|
||||
|
||||
{
|
||||
const size_t memory_size_k = kv_self->size_k_bytes();
|
||||
const size_t memory_size_v = kv_self->size_v_bytes();
|
||||
|
||||
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
|
||||
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||
}
|
||||
memory.reset(model.create_memory(params_mem, cparams));
|
||||
}
|
||||
|
||||
// init backends
|
||||
|
@@ -305,7 +272,9 @@ llama_context::llama_context(
|
|||
int n_nodes_tg = -1;
|
||||
|
||||
// simulate full KV cache
|
||||
kv_self->n = kv_self->size;
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->set_full();
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
|
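The worst-case graph reservation above now calls set_full() instead of assigning kv_self->n = kv_self->size directly, so the context no longer reaches into cache internals. A minimal sketch of the idea, using a placeholder struct rather than the real cache class:

#include <cstdint>

struct kv_cells_sketch {
    uint32_t size = 0; // total number of allocated KV cells
    uint32_t n    = 0; // number of cells the attention graph will consider

    // hypothetical equivalent of the removed `kv_self->n = kv_self->size`
    void set_full() { n = size; }
};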
@@ -427,6 +396,18 @@ const llama_model & llama_context::get_model() const {
|
|||
return model;
|
||||
}
|
||||
|
||||
const llama_cparams & llama_context::get_cparams() const {
|
||||
return cparams;
|
||||
}
|
||||
|
||||
ggml_backend_sched_t llama_context::get_sched() const {
|
||||
return sched.get();
|
||||
}
|
||||
|
||||
ggml_context * llama_context::get_ctx_compute() const {
|
||||
return ctx_compute.get();
|
||||
}
|
||||
|
||||
uint32_t llama_context::n_ctx() const {
|
||||
return cparams.n_ctx;
|
||||
}
|
||||
|
@@ -456,337 +437,21 @@ uint32_t llama_context::n_threads_batch() const {
|
|||
}
|
||||
|
||||
llama_kv_cache * llama_context::get_kv_self() {
|
||||
return kv_self.get();
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
return kv_self;
|
||||
}
|
||||
|
||||
const llama_kv_cache * llama_context::get_kv_self() const {
|
||||
return kv_self.get();
|
||||
}
|
||||
|
||||
ggml_tensor * llama_context::build_rope_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & rope_type = hparams.rope_type;
|
||||
|
||||
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
|
||||
|
||||
ggml_tensor * tmp;
|
||||
|
||||
if (ggml_is_quantized(cur->type)) {
|
||||
// dequantize to f32 -> RoPE -> quantize back
|
||||
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
|
||||
|
||||
tmp = ggml_rope_ext(ctx0, tmp,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
|
||||
tmp = ggml_cpy(ctx0, tmp, cur);
|
||||
} else {
|
||||
// we rotate only the first n_rot dimensions
|
||||
tmp = ggml_rope_ext_inplace(ctx0, cur,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
class llm_graph_input_k_shift : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
|
||||
virtual ~llm_graph_input_k_shift() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * k_shift; // I32 [kv_size]
|
||||
|
||||
const llama_kv_cache_unified * kv_self;
|
||||
};
|
||||
|
||||
void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_UNUSED(ubatch);
|
||||
|
||||
if (k_shift) {
|
||||
assert(ggml_backend_buffer_is_host(k_shift->buffer));
|
||||
|
||||
int32_t * data = (int32_t *) k_shift->data;
|
||||
|
||||
for (uint32_t i = 0; i < kv_self->size; ++i) {
|
||||
data[i] = kv_self->cells[i].delta;
|
||||
}
|
||||
}
|
||||
}
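The k_shift input filled above reads a per-cell delta. As a rough illustration of where that delta comes from, a position shift such as llama_kv_self_seq_add() could accumulate it roughly like this before the shift graph consumes it and the deltas are cleared (simplified sketch: the sequence-id filter and head/used bookkeeping of the real cell type are omitted):

#include <cstdint>
#include <vector>

struct kv_cell_sketch {
    int32_t pos   = -1; // absolute position stored in this cell
    int32_t delta =  0; // accumulated shift, later consumed by the K-shift graph
};

// apply position delta d to cells in [p0, p1) -- simplified, no seq_id check
void seq_add_sketch(std::vector<kv_cell_sketch> & cells, int32_t p0, int32_t p1, int32_t d) {
    for (auto & cell : cells) {
        if (cell.pos >= p0 && cell.pos < p1) {
            cell.pos   += d;
            cell.delta += d; // remembered until kv_self_update() applies the RoPE shift
        }
    }
}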
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & n_layer = hparams.n_layer;
|
||||
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
//GGML_ASSERT(kv_self->size == n_ctx);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_k_shift>(kv_self.get());
|
||||
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_ctx);
|
||||
ggml_set_input(inp->k_shift);
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
const bool is_swa = hparams.is_swa(il);
|
||||
|
||||
// note: the swa rope params could become part of the cparams in the future
|
||||
// if we decide to make them configurable, like the non-sliding ones
|
||||
const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
||||
|
||||
ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il);
|
||||
|
||||
ggml_tensor * k =
|
||||
ggml_view_3d(ctx0, kv_self->k_l[il],
|
||||
n_embd_head_k, n_head_kv, kv_self->size,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
0);
|
||||
|
||||
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & ids = kv_self->defrag_info.ids;
|
||||
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
// TODO: optimizations are possible:
|
||||
// - multiple threads
|
||||
// - avoid copying to the host memory when already there
|
||||
//
|
||||
// likely not worth the effort, as we have ggml_graph based defrag
|
||||
//
|
||||
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
const uint32_t kv_size = size;
|
||||
|
||||
std::vector<uint8_t> buf_k;
|
||||
std::vector<uint8_t> buf_v;
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
|
||||
const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
|
||||
|
||||
const size_t v_size_el = ggml_type_size(v_l[il]->type);
|
||||
const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
|
||||
|
||||
buf_k.resize(k_size);
|
||||
buf_v.resize(v_size);
|
||||
|
||||
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
|
||||
// batch move [i, i+nm) to [id, id+nm)
|
||||
// note: cells can move only to a lower index
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < n_kv && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
// move keys
|
||||
{
|
||||
const int64_t os = i*k_size_row;
|
||||
const int64_t od = id*k_size_row;
|
||||
|
||||
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
|
||||
}
|
||||
|
||||
// move values (note: they are transposed)
|
||||
{
|
||||
const int64_t os = i;
|
||||
const int64_t od = id;
|
||||
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
|
||||
}
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
for (uint32_t i = 0; i < ids.size(); ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == ids.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
|
||||
|
||||
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
|
||||
if (cparams.flash_attn) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
|
||||
} else {
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
|
||||
ggml_row_size(kv_self->v_l[il]->type, i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
|
||||
ggml_row_size(kv_self->v_l[il]->type, id));
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
||||
#endif
|
||||
|
||||
return res;
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
return kv_self;
|
||||
}
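The defrag graph above consumes a precomputed move plan ids, where ids[i] == i means cell i stays put and ids[i] == ids.size() means the cell holds no data. That plan is produced elsewhere (defrag_prepare() is not part of this hunk); a hypothetical planner that packs used cells towards lower indices could look like this:

#include <cstdint>
#include <vector>

// hypothetical sketch, not the actual llama_kv_cache_unified::defrag_prepare()
std::vector<uint32_t> plan_defrag_sketch(const std::vector<bool> & used) {
    const uint32_t n = (uint32_t) used.size();

    std::vector<uint32_t> ids(n, n); // ids[i] == n marks cells with no data to move

    uint32_t dst = 0;
    for (uint32_t src = 0; src < n; ++src) {
        if (!used[src]) {
            continue;
        }
        ids[src] = dst++; // cells only ever move to a lower (or equal) index
    }

    return ids;
}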
|
||||
|
||||
void llama_context::kv_self_update() {
|
||||
auto & kv = kv_self;
|
||||
|
||||
bool need_reserve = false;
|
||||
|
||||
if (kv->has_shift) {
|
||||
if (!kv->get_can_shift()) {
|
||||
GGML_ABORT("The current context does not support K-shift");
|
||||
}
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
|
||||
|
||||
// apply K-shift if needed
|
||||
if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
auto * gf = graph_init();
|
||||
|
||||
auto res = build_kv_self_shift(ctx_compute.get(), gf);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
graph_compute(gf, false);
|
||||
|
||||
need_reserve = true;
|
||||
}
|
||||
|
||||
{
|
||||
kv->has_shift = false;
|
||||
|
||||
for (uint32_t i = 0; i < kv->size; ++i) {
|
||||
kv->cells[i].delta = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// defragment the KV cache if needed
|
||||
if (kv->do_defrag) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
|
||||
if (kv->defrag_prepare(graph_max_nodes())) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
auto * gf = graph_init();
|
||||
|
||||
auto res = build_kv_self_defrag(ctx_compute.get(), gf);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
graph_compute(gf, false);
|
||||
|
||||
need_reserve = true;
|
||||
}
|
||||
|
||||
kv->do_defrag = false;
|
||||
}
|
||||
need_reserve = kv_self->update(*this);
|
||||
|
||||
// reserve a worst case graph if needed
|
||||
if (need_reserve) {
|
||||
|
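kv_self_update() now delegates the K-shift and defrag handling to the cache through update(), which reports whether the worst-case graphs must be re-reserved. A skeletal illustration of that contract (hypothetical: the real method receives a llama_context & and builds/computes ggml graphs through it, and it only requests a reserve when work was actually done):

struct kv_update_sketch {
    bool has_shift = false; // set by seq_add()/seq_div() on the cache
    bool do_defrag = false; // set by defrag_sched() when fragmentation crosses the threshold

    // returns true when the caller must re-reserve its worst-case graphs
    bool update() {
        bool need_reserve = false;

        if (has_shift) {
            // build + compute the K-shift graph, then clear the per-cell deltas
            has_shift    = false;
            need_reserve = true;
        }

        if (do_defrag) {
            // build + compute the defrag copy graph
            do_defrag    = false;
            need_reserve = true;
        }

        return need_reserve;
    }
};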
@@ -797,7 +462,7 @@ void llama_context::kv_self_update() {
|
|||
uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// simulate full KV cache
|
||||
kv_self->n = kv_self->size;
|
||||
kv_self->set_full();
|
||||
|
||||
llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
||||
|
@@ -818,9 +483,6 @@ enum llama_pooling_type llama_context::pooling_type() const {
|
|||
}
|
||||
|
||||
float * llama_context::get_logits() {
|
||||
// reorder logits for backward compatibility
|
||||
output_reorder();
|
||||
|
||||
return logits;
|
||||
}
|
||||
|
||||
|
@@ -863,9 +525,6 @@ float * llama_context::get_logits_ith(int32_t i) {
|
|||
}
|
||||
|
||||
float * llama_context::get_embeddings() {
|
||||
// reorder embeddings for backward compatibility
|
||||
output_reorder();
|
||||
|
||||
return embd;
|
||||
}
|
||||
|
||||
|
@@ -1017,8 +676,8 @@ int llama_context::encode(llama_batch & inp_batch) {
|
|||
}
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
// TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1);
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
const int32_t n_tokens = batch.n_tokens;
|
||||
|
@@ -1047,7 +706,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
|||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
|
||||
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
|
||||
|
||||
|
@@ -1181,9 +840,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|||
return -1;
|
||||
}
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
// TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1);
|
||||
// TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
|
||||
|
@@ -1195,7 +856,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
llama_kv_cache_guard kv_guard(kv_self.get());
|
||||
llama_kv_cache_guard kv_guard(kv_self);
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
|
@@ -1236,11 +897,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|||
n_outputs_all = 1;
|
||||
}
|
||||
|
||||
const bool logits_all = n_outputs_all == n_tokens_all;
|
||||
|
||||
sbatch.from_batch(batch, n_embd,
|
||||
/* simple_split */ !kv_self->recurrent,
|
||||
/* logits_all */ logits_all);
|
||||
llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all);
|
||||
|
||||
// reserve output buffer
|
||||
if (output_reserve(n_outputs_all) < n_outputs_all) {
|
||||
|
@@ -1254,22 +911,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|||
int64_t n_outputs_prev = 0;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
llama_ubatch ubatch = llama_ubatch();
|
||||
|
||||
const auto & n_ubatch = cparams.n_ubatch;
|
||||
|
||||
if (kv_self->recurrent) {
|
||||
if (embd_pooled) {
|
||||
// Pooled embeddings cannot be split across ubatches (yet)
|
||||
ubatch = sbatch.split_seq(cparams.n_ubatch);
|
||||
} else {
|
||||
// recurrent model architectures are easier to implement
|
||||
// with equal-length sequences
|
||||
ubatch = sbatch.split_equal(cparams.n_ubatch);
|
||||
}
|
||||
} else {
|
||||
ubatch = sbatch.split_simple(n_ubatch);
|
||||
}
|
||||
llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
|
||||
|
||||
// count the outputs in this u_batch
|
||||
{
|
||||
|
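The if/else on kv_self->recurrent that used to choose between split_simple(), split_equal() and split_seq() now lives behind ubatch_next(). A toy rendering of that dispatch with stub types (the real llama_sbatch and llama_ubatch are far richer, and the split strategies genuinely differ):

#include <cstdint>

struct ubatch_stub { uint32_t n_tokens = 0; };

struct sbatch_stub {
    uint32_t n_tokens = 0;

    ubatch_stub take(uint32_t n) {
        ubatch_stub u;
        u.n_tokens = n < n_tokens ? n : n_tokens;
        n_tokens  -= u.n_tokens;
        return u;
    }

    // the real llama_sbatch distinguishes these; here they only differ by name
    ubatch_stub split_simple(uint32_t n) { return take(n); }
    ubatch_stub split_equal (uint32_t n) { return take(n); }
    ubatch_stub split_seq   (uint32_t n) { return take(n); }
};

struct memory_iface_stub {
    virtual ~memory_iface_stub() = default;
    virtual ubatch_stub ubatch_next(sbatch_stub & s, uint32_t n_ubatch, bool embd_pooled) const = 0;
};

// unified cache: plain contiguous split, as before
struct unified_stub : memory_iface_stub {
    ubatch_stub ubatch_next(sbatch_stub & s, uint32_t n_ubatch, bool /*embd_pooled*/) const override {
        return s.split_simple(n_ubatch);
    }
};

// recurrent cache: per-sequence split for pooled embeddings, equal-length split otherwise
struct recurrent_stub : memory_iface_stub {
    ubatch_stub ubatch_next(sbatch_stub & s, uint32_t n_ubatch, bool embd_pooled) const override {
        return embd_pooled ? s.split_seq(n_ubatch) : s.split_equal(n_ubatch);
    }
};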
@@ -1289,24 +931,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|||
}
|
||||
|
||||
// find KV slot
|
||||
{
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!kv_self->recurrent) {
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// after enough generations, the benefit from this heuristic disappears
|
||||
// if we start defragmenting the cache, the benefit from this will be more important
|
||||
const uint32_t pad = kv_self->get_padding(cparams);
|
||||
kv_self->n = std::min(kv_self->size, std::max(pad, GGML_PAD(kv_self->cell_max(), pad)));
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head);
|
||||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
||||
|
||||
|
@@ -1424,18 +1054,52 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|||
{
|
||||
bool sorted_output = true;
|
||||
|
||||
GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all);
|
||||
auto & out_ids = sbatch.out_ids;
|
||||
|
||||
GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
|
||||
|
||||
for (int64_t i = 0; i < n_outputs_all; ++i) {
|
||||
int64_t out_id = sbatch.out_ids[i];
|
||||
int64_t out_id = out_ids[i];
|
||||
output_ids[out_id] = i;
|
||||
if (out_id != i) {
|
||||
sorted_output = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (sorted_output) {
|
||||
sbatch.out_ids.clear();
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
|
||||
for (int32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
int32_t j_min = i;
|
||||
for (int32_t j = i + 1; j < n_outputs; ++j) {
|
||||
if (out_ids[j] < out_ids[j_min]) {
|
||||
j_min = j;
|
||||
}
|
||||
}
|
||||
if (j_min == i) { continue; }
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
for (int32_t i = 0; i < n_outputs; ++i) {
|
||||
output_ids[out_ids[i]] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1446,17 +1110,8 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|||
//synchronize();
|
||||
|
||||
// decide if we need to defrag the kv cache
|
||||
if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
|
||||
// - do not defrag small contexts (i.e. < 2048 tokens)
|
||||
// - count the padding towards the number of used tokens
|
||||
const float fragmentation = kv_self->n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f;
|
||||
|
||||
// queue defragmentation for next llama_kv_cache_update
|
||||
if (fragmentation > cparams.defrag_thold) {
|
||||
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
|
||||
|
||||
kv_self->defrag();
|
||||
}
|
||||
if (cparams.defrag_thold > 0.0f) {
|
||||
kv_self->defrag_sched(cparams.defrag_thold);
|
||||
}
|
||||
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
|
@@ -1542,44 +1197,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
return n_outputs_max;
|
||||
}
|
||||
|
||||
void llama_context::output_reorder() {
|
||||
auto & out_ids = sbatch.out_ids;
|
||||
if (!out_ids.empty()) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
|
||||
for (int32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
int32_t j_min = i;
|
||||
for (int32_t j = i + 1; j < n_outputs; ++j) {
|
||||
if (out_ids[j] < out_ids[j_min]) {
|
||||
j_min = j;
|
||||
}
|
||||
}
|
||||
if (j_min == i) { continue; }
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
for (int32_t i = 0; i < n_outputs; ++i) {
|
||||
output_ids[out_ids[i]] = i;
|
||||
}
|
||||
out_ids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
|
@@ -1616,7 +1233,7 @@ llm_graph_result_ptr llama_context::graph_build(
|
|||
/*.backend_cpu =*/ backend_cpu,
|
||||
/*.cvec =*/ &cvec,
|
||||
/*.loras =*/ &loras,
|
||||
/*.memory =*/ kv_self.get(),
|
||||
/*.memory =*/ memory.get(),
|
||||
/*.cross =*/ &cross,
|
||||
/*.n_outputs =*/ n_outputs,
|
||||
/*.cb =*/ graph_get_cb(),
|
||||
|
@@ -2020,8 +1637,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
|||
{
|
||||
LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
|
||||
|
||||
output_reorder();
|
||||
|
||||
const auto n_outputs = this->n_outputs;
|
||||
const auto & output_ids = this->output_ids;
|
||||
|
||||
|
@@ -2075,6 +1690,8 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
|||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_write(io);
|
||||
|
||||
return io.n_bytes();
|
||||
|
@@ -2159,6 +1776,8 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
|
|||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_read(io);
|
||||
|
||||
return io.n_bytes();
|
||||
|
@@ -2167,6 +1786,8 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
|
|||
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
|
||||
GGML_UNUSED(seq_id);
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_write(io, seq_id);
|
||||
|
||||
return io.n_bytes();
|
||||
|
@@ -2175,6 +1796,8 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s
|
|||
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
GGML_UNUSED(seq_id);
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
kv_self->state_read(io, seq_id);
|
||||
|
||||
return io.n_bytes();
|
||||
|
@@ -2530,7 +2153,7 @@ void llama_kv_cache_seq_cp(
|
|||
llama_seq_id seq_id_dst,
|
||||
llama_pos p0,
|
||||
llama_pos p1) {
|
||||
return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
|
||||
llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_cp(
|
||||
|
@@ -2544,14 +2167,14 @@ void llama_kv_self_seq_cp(
|
|||
return;
|
||||
}
|
||||
|
||||
return kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
||||
kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
void llama_kv_cache_seq_keep(
|
||||
llama_context * ctx,
|
||||
llama_seq_id seq_id) {
|
||||
return llama_kv_self_seq_keep(ctx, seq_id);
|
||||
llama_kv_self_seq_keep(ctx, seq_id);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
|
||||
|
@@ -2560,7 +2183,7 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
|
|||
return;
|
||||
}
|
||||
|
||||
return kv->seq_keep(seq_id);
|
||||
kv->seq_keep(seq_id);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
|
@@ -2570,7 +2193,7 @@ void llama_kv_cache_seq_add(
|
|||
llama_pos p0,
|
||||
llama_pos p1,
|
||||
llama_pos delta) {
|
||||
return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
|
||||
llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_add(
|
||||
|
@@ -2584,7 +2207,7 @@ void llama_kv_self_seq_add(
|
|||
return;
|
||||
}
|
||||
|
||||
return kv->seq_add(seq_id, p0, p1, delta);
|
||||
kv->seq_add(seq_id, p0, p1, delta);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
|
@@ -2594,7 +2217,7 @@ void llama_kv_cache_seq_div(
|
|||
llama_pos p0,
|
||||
llama_pos p1,
|
||||
int d) {
|
||||
return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
|
||||
llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
|
||||
}
|
||||
|
||||
void llama_kv_self_seq_div(
|
||||
|
@@ -2608,7 +2231,7 @@ void llama_kv_self_seq_div(
|
|||
return;
|
||||
}
|
||||
|
||||
return kv->seq_div(seq_id, p0, p1, d);
|
||||
kv->seq_div(seq_id, p0, p1, d);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
|
@@ -2627,7 +2250,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
|
|||
|
||||
// deprecated
|
||||
void llama_kv_cache_defrag(llama_context * ctx) {
|
||||
return llama_kv_self_defrag(ctx);
|
||||
llama_kv_self_defrag(ctx);
|
||||
}
|
||||
|
||||
void llama_kv_self_defrag(llama_context * ctx) {
|
||||
|
@@ -2636,7 +2259,8 @@ void llama_kv_self_defrag(llama_context * ctx) {
|
|||
return;
|
||||
}
|
||||
|
||||
return kv->defrag();
|
||||
// force defrag
|
||||
kv->defrag_sched(-1.0f);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
|
|