kv-cache : refactor + add llama_memory_state_i (#13746)

* kv-cache : simplify the "struct llama_kv_cache" interface

ggml-ci

* kv-cache : revert the (n_swa + n_ubatch) change (for next PR)

ggml-ci

* kv-cache : some comments

ggml-ci

* context : fix graph reserve for multiple sequences

ggml-ci

* kv-cache : fix typo [no ci]

* kv-cache : fix find_slot() logic for free slots

ggml-ci

* llama : add TODO for deprecating the defrag API in the future

* kv-cache : improve find_slot() using min/max seq pos info

ggml-ci

* llama : handle aborts and compute errors

ggml-ci

* memory : extract state into llama_memory_state

ggml-ci

* kv-cache : add comments

ggml-ci

* server : update batching logic to reset n_batch on successful decode (see the sketch below)

* server : upon full re-processing, remove the sequence from the cache

* kv-cache : add TODO for doing split_equal when split_simple fails

ggml-ci
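
For the two server changes above, a minimal sketch of the resulting batching loop, assuming the server's usual halve-and-retry pattern (ctx, batch, batch_view and the bookkeeping below are illustrative, not verbatim from the commit):

    int32_t n_batch = llama_n_batch(ctx);

    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
        // ... build batch_view with at most n_batch tokens starting at i ...

        const int ret = llama_decode(ctx, batch_view);

        if (ret != 0) {
            if (n_batch == 1 || ret < 0) {
                break; // fatal: could not decode even a minimal batch
            }
            n_batch /= 2; // retry this chunk with half the batch size
            i -= n_batch;
            continue;
        }

        // reset n_batch after a successful decode so that one transient
        // failure does not permanently shrink the chunk size
        n_batch = llama_n_batch(ctx);
    }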
Author: Georgi Gerganov, 2025-05-31 10:24:04 +03:00 (committed by GitHub)
parent eb3949938e
commit 12d0188c0d
14 changed files with 1304 additions and 655 deletions


@@ -2,6 +2,11 @@
#include "llama.h"
#include <memory>
#include <vector>
struct llama_ubatch;
struct llama_memory_params {
// kv cache
ggml_type type_k;
@@ -30,3 +35,42 @@ public:
    virtual bool get_can_edit() const = 0;
};

enum llama_memory_status {
    LLAMA_MEMORY_STATUS_SUCCESS = 0,
    LLAMA_MEMORY_STATUS_FAILED_PREPARE,
    LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
};
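
// example (not part of this commit; the mapping below is illustrative):
// a caller such as llama_decode() could translate the status into its
// return code, e.g.:
//
//   switch (status) {
//       case LLAMA_MEMORY_STATUS_SUCCESS:        return  0;
//       case LLAMA_MEMORY_STATUS_FAILED_PREPARE: return  1; // retry with a smaller batch
//       case LLAMA_MEMORY_STATUS_FAILED_COMPUTE: return -1; // fatal compute error or abort
//   }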
// the interface for managing the memory state during batch processing
// this interface is implemented per memory type. see:
//   - llama_kv_cache_unified_state
//   - llama_kv_cache_unified_iswa_state
//   ...
//
// the only method that can mutate the memory and the memory state is llama_memory_state_i::apply()
//
// TODO: rename to llama_memory_context_i ?
class llama_memory_state_i {
public:
    virtual ~llama_memory_state_i() = default;

    // consume the current ubatch from the state and proceed to the next one
    // return false if we are done
    virtual bool next() = 0;

    // apply the memory state for the current ubatch to the memory object
    // return false on failure
    virtual bool apply() = 0;

    // TODO: this might get reworked in the future when refactoring llama_batch
    virtual std::vector<int64_t> & out_ids() = 0;

    // get the current ubatch
    virtual const llama_ubatch & get_ubatch() const = 0;

    // get the status of the memory state
    virtual llama_memory_status get_status() const = 0;
};

using llama_memory_state_ptr = std::unique_ptr<llama_memory_state_i>;
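
To make the contract concrete, a minimal consumption sketch; memory->init_batch() is a hypothetical factory returning a llama_memory_state_ptr (the concrete entry point is not shown in the hunks above):

    llama_memory_state_ptr state = memory->init_batch(batch, n_ubatch); // hypothetical

    if (state->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
        return 1; // failed to prepare the batch (e.g. no free slot found)
    }

    while (true) {
        const llama_ubatch & ubatch = state->get_ubatch();

        // the only call that mutates the memory and the memory state
        if (!state->apply()) {
            return -1; // compute/apply failure
        }

        // ... build and compute the graph for `ubatch` here ...

        if (!state->next()) {
            break; // all ubatches consumed
        }
    }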