diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 22118faf..d7b269df 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -392,7 +392,7 @@ int main(int argc, char ** argv) {
                     return 1;
                 }
 
-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
                 n_cache_miss += 1;
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 808fe599..57c7b422 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -424,28 +424,33 @@ const llama_kv_cache * llama_context::get_kv_self() const {
     return kv_self;
 }
 
-void llama_context::kv_self_update() {
+bool llama_context::kv_self_update() {
     if (!memory) {
-        return;
+        return false;
     }
 
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
-    if (kv_self->update(*this)) {
-        // if the KV cache did any computation, we have to reserve a new worst-case graph
-        const auto kv_state = kv_self->init_full();
-        if (!kv_state) {
-            throw std::runtime_error("failed to initialize KV cache");
-        }
-
-        const uint32_t n_seqs = cparams.n_seq_max;
-        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
-        if (!gf) {
-            LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
-        }
+    if (!kv_self->update(*this)) {
+        // no updates have been performed
+        return false;
     }
+
+    // if the KV cache did any computation, we have to reserve a new worst-case graph
+    const auto kv_state = kv_self->init_full();
+    if (!kv_state) {
+        throw std::runtime_error("failed to initialize KV cache");
+    }
+
+    const uint32_t n_seqs = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+    auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
+    if (!gf) {
+        LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
+    }
+
+    return true;
 }
 
 enum llama_pooling_type llama_context::pooling_type() const {
@@ -933,24 +938,44 @@ int llama_context::decode(llama_batch & inp_batch) {
     // handle any pending defrags/shifts
     kv_self_update();
 
-    auto kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
-    if (!kv_state) {
-        return -2;
-    }
+    llama_memory_state_ptr kv_state;
 
-    switch (kv_state->get_status()) {
-        case LLAMA_MEMORY_STATUS_SUCCESS:
-            {
-            } break;
-        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
-            {
-                // not a fatal error, we can re-try with a different batch
-                return 1;
-            }
-        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
-            {
-                return -2;
-            }
+    bool did_defrag = false;
+
+    while (true) {
+        kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+        if (!kv_state) {
+            return -2;
+        }
+
+        switch (kv_state->get_status()) {
+            case LLAMA_MEMORY_STATUS_SUCCESS:
+                {
+                } break;
+            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+                {
+                    if (!did_defrag) {
+                        did_defrag = true;
+
+                        kv_self->defrag_sched(-1.0f);
+                        if (kv_self_update()) {
+                            LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);
+
+                            continue;
+                        }
+                    }
+
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+                    return 1;
+                }
+            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+                {
+                    return -2;
+                }
+        }
+
+        break;
     }
 
     // reserve output buffer
@@ -2646,22 +2671,8 @@ int32_t llama_encode(
 int32_t llama_decode(
         llama_context * ctx,
           llama_batch   batch) {
-    int ret = ctx->decode(batch);
-
-    // defrag and try again
-    // TODO: distinguish return code when we are sure that even after defrag there is no space available
-    if (ret == 1) {
-        llama_kv_self_defrag(ctx);
-        ret = ctx->decode(batch);
-
-        if (ret == 1) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
-
-            return ret;
-        }
-    }
-
-    if (ret != 0) {
+    const int ret = ctx->decode(batch);
+    if (ret != 0 && ret != 1) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
diff --git a/src/llama-context.h b/src/llama-context.h
index 5b79bafa..3b880286 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -50,8 +50,9 @@ struct llama_context {
           llama_kv_cache * get_kv_self();
    const llama_kv_cache * get_kv_self() const;
 
+    // return true if the KV cache was updated
     // TODO: remove
-    void kv_self_update();
+    bool kv_self_update();
 
     enum llama_pooling_type pooling_type() const;
 
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 86c4f281..4726b700 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1809,9 +1809,10 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
 llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
     GGML_UNUSED(embd_pooled);
 
-    auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
+    // TODO: if we fail with split_simple, we should attempt different splitting strategies
+    // but to do that properly, we first have to refactor the batches to be more flexible
 
-    // TODO: if we fail with split_simple, we should attempt split_equal
+    auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
 
     std::vector<llama_ubatch> ubatches;
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 90981ff9..46dbe5cc 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3431,7 +3431,7 @@ struct server_context {
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
 
-                SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
+                SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
 
                 continue; // continue loop of n_batch
             }
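
With this change, llama_decode() no longer performs the defrag-and-retry on behalf of the caller: the single defrag retry now happens inside llama_context::decode(), and a return value of 1 means no KV cache slot could be found even after that retry. Callers are expected to react themselves, as examples/parallel/parallel.cpp and tools/server/server.cpp do by halving the batch size. The sketch below illustrates such a caller-side loop; it is illustrative only - the helper name decode_with_backoff is not part of this patch, and it assumes the two-argument llama_batch_get_one() helper from llama.h.

// Minimal sketch of a caller reacting to llama_decode() returning 1
// (no KV cache slot found) by shrinking the submitted chunk.
#include "llama.h"

#include <algorithm>
#include <cstdio>
#include <vector>

// hypothetical helper, not part of this patch
static int decode_with_backoff(llama_context * ctx, std::vector<llama_token> & tokens, int32_t n_batch) {
    for (int32_t i = 0; i < (int32_t) tokens.size(); ) {
        const int32_t n_eval = std::min(n_batch, (int32_t) tokens.size() - i);

        const int ret = llama_decode(ctx, llama_batch_get_one(tokens.data() + i, n_eval));

        if (ret == 1) {
            // no slot even after the internal defrag retry - shrink the chunk and try again
            if (n_batch == 1) {
                fprintf(stderr, "KV cache is full - try increasing the context size\n");
                return 1;
            }
            n_batch /= 2;
            continue;
        }

        if (ret != 0) {
            return ret; // fatal error, e.g. -2
        }

        i += n_eval;
    }

    return 0;
}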