llama : deprecate llama_kv_self_ API (#14030)

* llama : deprecate llama_kv_self_ API

ggml-ci

* llama : allow llama_memory_(nullptr)

ggml-ci

* memory : add flag for optional data clear in llama_memory_clear

ggml-ci
2025-06-06 14:11:15 +03:00 · 2025-06-06 14:11:15 +03:00 · 745aa5319b
commit 745aa5319b
parent 487a5e0401
34 changed files with 206 additions and 127 deletions
--- a/tools/batched-bench/batched-bench.cpp
+++ b/tools/batched-bench/batched-bench.cpp
@@ -57,6 +57,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    auto * mem = llama_get_memory(ctx);
+
    const int32_t n_kv_max = llama_n_ctx(ctx);

    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
@@ -132,7 +134,7 @@ int main(int argc, char ** argv) {

                const auto t_pp_start = ggml_time_us();

-                llama_kv_self_clear(ctx);
+                llama_memory_clear(mem, false);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -141,7 +143,7 @@ int main(int argc, char ** argv) {

                if (is_pp_shared) {
                    for (int32_t i = 1; i < pl; ++i) {
-                        llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
+                        llama_memory_seq_cp(mem, 0, i, -1, -1);
                    }
                }

--- a/tools/cvector-generator/cvector-generator.cpp
+++ b/tools/cvector-generator/cvector-generator.cpp
@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }

 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -498,7 +498,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);

        llama_batch batch = llama_batch_init(n_batch, 0, 1);

--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -1900,7 +1900,7 @@ int main(int argc, char ** argv) {

        test t(inst, lmodel, ctx);

-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), false);

        // cool off before the test
        if (params.delay) {
@@ -1948,7 +1948,7 @@ int main(int argc, char ** argv) {
        }

        for (int i = 0; i < params.reps; i++) {
-            llama_kv_self_clear(ctx);
+            llama_memory_clear(llama_get_memory(ctx), false);

            if (t.n_depth > 0) {
                if (params.progress) {
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -147,6 +147,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    auto * mem = llama_get_memory(ctx);
+
    const llama_vocab * vocab = llama_model_get_vocab(model);
    auto chat_templates = common_chat_templates_init(model, params.chat_template);

@@ -351,7 +353,7 @@ int main(int argc, char ** argv) {
        }

        // remove any "future" tokens that we might have inherited from the previous session
-        llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+        llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1);
    }

    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@@ -599,8 +601,8 @@ int main(int argc, char ** argv) {
                    LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                            n_past, n_left, n_ctx, params.n_keep, n_discard);

-                    llama_kv_self_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                    llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+                    llama_memory_seq_rm (mem, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard);

                    n_past -= n_discard;

@@ -623,9 +625,9 @@ int main(int argc, char ** argv) {
                    LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

-                    llama_kv_self_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
-                    llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
-                    llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
+                    llama_memory_seq_add(mem, 0, ga_i,                n_past,              ib*bd);
+                    llama_memory_seq_div(mem, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
+                    llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);

                    n_past -= bd;

--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -342,7 +342,7 @@ int main(int argc, char ** argv) {
            }
            if (line == "/clear") {
                ctx.n_past = 0;
-                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+                llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
                LOG("Chat history cleared\n\n");
                continue;
            }
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);

        llama_batch batch = llama_batch_init(n_batch, 0, 1);

@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
            return;
        }

-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1217,7 +1217,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
            return;
        }

-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1592,7 +1592,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
            return;
        }

-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1782,7 +1782,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
        }

        // clear the KV cache
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);

        llama_batch batch = llama_batch_init(n_batch, 0, 1);

--- a/tools/run/run.cpp
+++ b/tools/run/run.cpp
@@ -939,7 +939,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
                           std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
-    const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == 0;
+    const bool is_first = llama_memory_seq_pos_max(llama_get_memory(llama_data.context.get()), 0) == 0;

    const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
    prompt_tokens.resize(n_prompt_tokens);
@@ -955,7 +955,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
    const int n_ctx      = llama_n_ctx(ctx.get());
-    const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0);
+    const int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx.get()), 0);
    if (n_ctx_used + batch.n_tokens > n_ctx) {
        printf(LOG_COL_DEFAULT "\n");
        printe("context size exceeded\n");
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2006,7 +2006,7 @@ struct server_context {
            }
        }

-        if (!llama_kv_self_can_shift(ctx)) {
+        if (!llama_memory_can_shift(llama_get_memory(ctx))) {
            if (params_base.ctx_shift) {
                params_base.ctx_shift = false;
                SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled");
@@ -2224,7 +2224,7 @@ struct server_context {
        SRV_DBG("%s", "clearing KV cache\n");

        // clear the entire KV cache
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);
        clean_kv_cache = false;
    }

@@ -2910,7 +2910,7 @@ struct server_context {

                    // Erase token cache
                    const size_t n_erased = slot->cache_tokens.size();
-                    llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
+                    llama_memory_seq_rm(llama_get_memory(ctx), slot->id, -1, -1);
                    slot->cache_tokens.clear();

                    auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2985,8 +2985,8 @@ struct server_context {

                SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

-                llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
+                llama_memory_seq_rm (llama_get_memory(ctx), slot.id, n_keep            , n_keep + n_discard);
+                llama_memory_seq_add(llama_get_memory(ctx), slot.id, n_keep + n_discard, slot.n_past,        -n_discard);

                // add generated tokens to cache
                {
@@ -3189,8 +3189,8 @@ struct server_context {

                                            const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

-                                            llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
-                                            llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                                            llama_memory_seq_rm (llama_get_memory(ctx), slot.id, head_p, head_c);
+                                            llama_memory_seq_add(llama_get_memory(ctx), slot.id, head_c, head_c + n_match, kv_shift);

                                            for (size_t i = 0; i < n_match; i++) {
                                                slot.cache_tokens.set_token(head_p + i, slot.cache_tokens[head_c + i]);
@@ -3212,7 +3212,7 @@ struct server_context {
                            }

                            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
-                                const auto pos_min = llama_kv_self_seq_pos_min(ctx, slot.id);
+                                const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                                if (pos_min == -1) {
                                    SLT_ERR(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min);
                                    GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
@@ -3247,9 +3247,9 @@ struct server_context {
                    }

                    // keep only the common part
-                    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                    if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.n_past, -1)) {
                        // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
+                        llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);

                        // there is no common part left
                        slot.n_past = 0;
@@ -3589,7 +3589,7 @@ struct server_context {
                slot.cache_tokens.push_back(id);
                slot.cache_tokens.insert({ids.begin(), ids.end() - 1});

-                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
+                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.n_past, -1);

                for (size_t i = 0; i < ids.size(); ++i) {
                    completion_token_output result;