kv-cache : add SWA support (#13194)
* kv-cache : prepare for SWA
* kv-cache : initial iSWA implementation
* kv-cache : rework error recovery logic
* models : fix Phi-3 SWA parameters
* model : adjust Granite to rope factor changes
* server : check if context can do shifts
* iswa : for now, always enable shifts (experiment)
* kv-cache : simplify SWA logic
* kv-cache : apply defrag when we fail to find slots for the batch
* llama : update docs about llama_decode
* kv-cache : update warning logs when no space for the batch is available
* llama : add llama_kv_self_seq_pos_min()
* kv-cache : keep track of partial SWA computes and print warnings
* server : disallow use cases involving partial SWA context
* llama : add param to control SWA cache size
* minor : clean-up
parent f0adb80bf7
commit e298d2fbd0
15 changed files with 1426 additions and 650 deletions
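The commit message mentions a new parameter to control the SWA cache size. Below is a minimal caller-side sketch of opting back into a full-size cache; it assumes the parameter added by this PR is the swa_full field of llama_context_params (name hedged, verify against llama.h):

    #include "llama.h"

    // Sketch: opt out of the reduced sliding-window cache. Assumption:
    // the "param to control SWA cache size" from this PR is the
    // swa_full flag on llama_context_params. With it set, the KV cache
    // keeps all positions even for SWA layers, trading memory for full
    // support of context shifting and prompt cache reuse.
    llama_context_params make_ctx_params() {
        llama_context_params cparams = llama_context_default_params();

        cparams.n_ctx    = 8192;
        cparams.swa_full = true; // assumed name of the new SWA-size control

        return cparams;
    }

Keeping the full cache costs memory proportional to n_ctx per SWA layer, but preserves the shift and reuse features that the reduced SWA cache has to disable, as the first hunk below shows.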
@@ -2004,6 +2004,23 @@ struct server_context {
             }
         }
 
+        if (!llama_kv_self_can_shift(ctx)) {
+            if (params_base.ctx_shift) {
+                params_base.ctx_shift = false;
+                SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled");
+            }
+
+            if (params_base.n_cache_reuse) {
+                params_base.n_cache_reuse = 0;
+                SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
+            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+                return false;
+            }
+        }
+
         return true;
     }
 
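The operation this hunk guards against is the usual context-shift pattern, which an SWA cache cannot support. A hedged caller-side sketch, assuming an initialized context, of gating that pattern on the new capability check:

    #include "llama.h"

    // Sketch: discard n_discard tokens after the first n_keep and slide
    // the remainder left, but only if the cache supports shifting. With
    // an SWA cache this must be skipped, which is why the server code
    // above disables ctx_shift and cache_reuse.
    static bool shift_context(llama_context * ctx, llama_seq_id seq_id,
                              llama_pos n_keep, llama_pos n_discard) {
        if (!llama_kv_self_can_shift(ctx)) {
            return false; // e.g. SWA cache: shifting is not supported
        }

        // remove [n_keep, n_keep + n_discard) and shift the tail back by n_discard
        llama_kv_self_seq_rm (ctx, seq_id, n_keep, n_keep + n_discard);
        llama_kv_self_seq_add(ctx, seq_id, n_keep + n_discard, -1, -n_discard);

        return true;
    }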
@@ -3181,7 +3198,15 @@ struct server_context {
                 // if we don't cache the prompt, we have to remove the entire KV cache
                 llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                 slot.n_past = 0;
-                slot.cache_tokens.clear();
+                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
             }
+
+            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                    SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
+                            "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+                    slot.n_past = 0;
+                }
+            }
         }
 
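This second hunk forces a full re-process when llama_kv_self_seq_pos_min() reports that position 0 has been evicted from the SWA cache, since partial prefix reuse would then attend over missing entries. A caller-side sketch of the same fallback for the default sequence (the helper name is hypothetical; the API calls are the ones added or used by this PR):

    #include "llama.h"

    #include <vector>

    // Sketch of the fallback the server applies above, for sequence 0:
    // if the SWA cache no longer holds position 0, drop the whole
    // sequence and re-decode the prompt from scratch instead of reusing
    // a cached prefix.
    static bool reprocess_if_partial(llama_context * ctx, std::vector<llama_token> & prompt) {
        if (llama_kv_self_seq_pos_min(ctx, 0) == 0) {
            return true; // cache still starts at position 0 - prefix reuse is safe
        }

        // p0 = 0, p1 = -1 removes every cached position of the sequence
        llama_kv_self_seq_rm(ctx, 0, 0, -1);

        // re-decode the full prompt in one batch (defaults to sequence 0)
        llama_batch batch = llama_batch_get_one(prompt.data(), (int32_t) prompt.size());
        return llama_decode(ctx, batch) == 0;
    }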