kv-cache : refactor the update/defrag mechanism (#13988)

* kv-cache : refactor update mechanism

ggml-ci

* memory : improve status handling

* defrag : reset head + add comments

ggml-ci

* cont : minor fixes

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-06-04 18:58:20 +03:00 committed by GitHub
parent 2589ad3704
commit 3e63a58ef7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 340 additions and 191 deletions

View file

@ -1,6 +1,7 @@
#include "llama-kv-cache-recurrent.h"
#include "llama-impl.h"
#include "llama-io.h"
#include "llama-batch.h"
#include "llama-model.h"
@ -386,6 +387,13 @@ llama_memory_state_ptr llama_kv_cache_recurrent::init_full() {
return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
}
// Builds the memory state handed back for an update request on this cache.
//
// Neither argument is consulted: the returned state is always constructed
// with LLAMA_MEMORY_STATUS_NO_UPDATE.
llama_memory_state_ptr llama_kv_cache_recurrent::init_update(llama_context * lctx, bool optimize) {
    // both parameters are intentionally ignored
    GGML_UNUSED(optimize);
    GGML_UNUSED(lctx);

    llama_memory_state_ptr state = std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_NO_UPDATE);

    return state;
}
bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
// simply remember the full state because it is very small for this type of cache
// TODO: optimize
@ -419,17 +427,6 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
return success;
}
// No-op for this cache type: the context is not used and the call
// unconditionally returns false.
bool llama_kv_cache_recurrent::update(llama_context & lctx) {
    // nothing to do here, the argument is deliberately ignored
    GGML_UNUSED(lctx);

    const bool result = false;

    return result;
}
// No-op for this cache type: the threshold argument is accepted and ignored.
void llama_kv_cache_recurrent::defrag_sched(float thold) {
    // nothing is scheduled here
    GGML_UNUSED(thold);

    return;
}
bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
const uint32_t n_tokens = ubatch.n_tokens;
const uint32_t n_seqs = ubatch.n_seqs;