
* kv-cache : prepare for SWA
* kv-cache : initial iSWA implementation
* kv-cache : rework error recovery logic
* models : fix Phi-3 SWA parameters
* model : adjust Granite to rope factor changes
* server : check if context can do shifts
* iswa : for now, always enable shifts (experiment)
* kv-cache : simplify SWA logic
* kv-cache : apply defrag when we fail to find slots for the batch
* llama : update docs about llama_decode
* kv-cache : update warning logs when no space for the batch is available
* llama : add llama_kv_self_seq_pos_min()
* kv-cache : keep track of partial SWA computes and print warnings
* server : disallow use cases involving partial SWA context
* llama : add param to control SWA cache size
* minor : clean-up
#pragma once

#include "llama.h"

struct llama_memory_params {
    // kv cache
    ggml_type type_k;
    ggml_type type_v;

    // use full-size SWA cache
    bool swa_full;
};

// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
class llama_memory_i {
public:
    virtual ~llama_memory_i() = default;

    virtual void clear() = 0;

    virtual bool seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
    virtual void seq_keep(llama_seq_id seq_id) = 0;
    virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
    virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;

    virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

    virtual bool get_can_edit() const = 0;
};
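
For orientation, here is a hedged usage sketch (illustrative, not taken from the library) of filling llama_memory_params for an f16 K/V cache that keeps a full-size SWA cache, i.e. the swa_full flag added by the "llama : add param to control SWA cache size" commit above. The field-comment initializer style mirrors llama.cpp conventions; the variable name mparams is an assumption.

// illustrative only: an f16 KV cache with a full-size SWA cache
llama_memory_params mparams = {
    /*.type_k   =*/ GGML_TYPE_F16,
    /*.type_v   =*/ GGML_TYPE_F16,
    /*.swa_full =*/ true,
};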
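To make the interface contract concrete, below is a minimal toy implementation sketch. The class toy_memory and its std::map bookkeeping are hypothetical, not part of llama.cpp: a real backend manages actual KV cells, while this toy tracks only a [min, max] position range per sequence, which is just enough to exercise the seq_* operations declared above.

#include <map>
#include <utility>

// toy_memory: hypothetical, illustration-only subclass of llama_memory_i;
// it tracks per-sequence [min, max] position ranges instead of real KV data
class toy_memory : public llama_memory_i {
public:
    void clear() override {
        ranges.clear();
    }

    bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) override {
        // negative bounds conventionally mean "unbounded"; the toy can only
        // drop a sequence wholesale, so partial removals report failure
        if (p0 < 0 && p1 < 0) {
            ranges.erase(seq_id);
            return true;
        }
        return false;
    }

    void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos, llama_pos) override {
        auto it = ranges.find(seq_id_src);
        if (it != ranges.end()) {
            ranges[seq_id_dst] = it->second;
        }
    }

    void seq_keep(llama_seq_id seq_id) override {
        // discard every sequence except seq_id
        auto it = ranges.find(seq_id);
        if (it == ranges.end()) {
            ranges.clear();
            return;
        }
        const auto kept = it->second;
        ranges.clear();
        ranges[seq_id] = kept;
    }

    void seq_add(llama_seq_id seq_id, llama_pos, llama_pos, llama_pos delta) override {
        // shift the whole tracked range (a real cache shifts only [p0, p1))
        auto it = ranges.find(seq_id);
        if (it != ranges.end()) {
            it->second.first  += delta;
            it->second.second += delta;
        }
    }

    void seq_div(llama_seq_id seq_id, llama_pos, llama_pos, int d) override {
        auto it = ranges.find(seq_id);
        if (it != ranges.end()) {
            it->second.first  /= d;
            it->second.second /= d;
        }
    }

    llama_pos seq_pos_min(llama_seq_id seq_id) const override {
        auto it = ranges.find(seq_id);
        return it == ranges.end() ? -1 : it->second.first;
    }

    llama_pos seq_pos_max(llama_seq_id seq_id) const override {
        auto it = ranges.find(seq_id);
        return it == ranges.end() ? -1 : it->second.second;
    }

    bool get_can_edit() const override {
        return true;
    }

private:
    std::map<llama_seq_id, std::pair<llama_pos, llama_pos>> ranges;
};

Reading back seq_pos_min() after such operations is what the newly added llama_kv_self_seq_pos_min() exposes at the public API level (see the commit list above), presumably in support of the "server : disallow use cases involving partial SWA context" change.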