From 06a92a193a07afe445929607be9d5e4d033956fb Mon Sep 17 00:00:00 2001 From: Clauszy Date: Wed, 5 Mar 2025 15:25:45 +0800 Subject: [PATCH] server : fix cache reuse logic (#12161) Previously, the first kv shift offset the positions of all tokens after head_c, not only the n_match tokens being reused. On the next reuse step, calling llama_kv_cache_seq_rm with head_c would then remove valid tokens, because their positions had already been offset. Limit the shift to the range [head_c, head_c + n_match). --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2306dc26..e4f7e43f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3003,7 +3003,7 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); + llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];