llama : deprecate llama_kv_self_ API (#14030)

* llama : deprecate llama_kv_self_ API

ggml-ci

* llama : allow llama_memory_(nullptr)

ggml-ci

* memory : add flag for optional data clear in llama_memory_clear

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-06-06 14:11:15 +03:00 committed by GitHub
parent 487a5e0401
commit 745aa5319b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 206 additions and 127 deletions

View file

@ -142,6 +142,8 @@ int main(int argc, char ** argv) {
}
}
auto * mem_tgt = llama_get_memory(ctx_tgt);
auto * mem_dft = llama_get_memory(ctx_dft);
// Tokenize the prompt
std::vector<llama_token> inp;
@ -420,14 +422,14 @@ int main(int argc, char ** argv) {
{
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_kv_self_seq_keep(ctx_dft, s_keep);
llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_dft, 0);
llama_memory_seq_keep(mem_dft, s_keep);
llama_memory_seq_cp (mem_dft, s_keep, 0, -1, -1);
llama_memory_seq_keep(mem_dft, 0);
llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_kv_self_seq_keep(ctx_tgt, s_keep);
llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_tgt, 0);
llama_memory_seq_rm (mem_tgt, s_keep, n_past_tgt, -1);
llama_memory_seq_keep(mem_tgt, s_keep);
llama_memory_seq_cp (mem_tgt, s_keep, 0, -1, -1);
llama_memory_seq_keep(mem_tgt, 0);
}
for (int s = 0; s < n_seq_dft; ++s) {
@ -444,7 +446,7 @@ int main(int argc, char ** argv) {
common_batch_clear(batch_dft);
common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1);
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);
@ -503,8 +505,8 @@ int main(int argc, char ** argv) {
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1);
llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1);
// all previous tokens from this branch are now also part of the new branch
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@ -585,9 +587,9 @@ int main(int argc, char ** argv) {
// evaluate the target model on the drafted tokens
{
llama_kv_self_seq_keep(ctx_tgt, 0);
llama_memory_seq_keep(mem_tgt, 0);
for (int s = 1; s < n_seq_dft; ++s) {
llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
llama_memory_seq_cp(mem_tgt, 0, s, -1, -1);
}
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());