context : allow cache-less context for embeddings (#13108)

* context : allow cache-less context for embeddings

ggml-ci

* context : enable reranking with encode()

ggml-ci

* context : encode() clears embd_seq

ggml-ci

* examples : use llama_encode() when appropriate

ggml-ci

* models : nomic bert moe does not require KV cache

* llama : update comments for llama_decode/llama_encode

ggml-ci

* context : update warning log [no ci]
Author: Georgi Gerganov (committed by GitHub)
Date:   2025-05-08 14:28:33 +03:00
Commit: 6562e5a4d6
Parent: 51fb96b1ff
5 changed files with 47 additions and 23 deletions
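
The practical upshot for client code: embedding workloads should now go through llama_encode(), which after this commit can run on a context that never allocates a KV cache. Below is a minimal sketch of that path, assuming a pooled embedding model; the model filename, the toy token IDs, and the choice of mean pooling are placeholders, not part of this commit:

```cpp
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("embedding-model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                     // embedding mode
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN;  // one pooled vector per sequence

    // after this commit, such a context can be cache-less (no KV cache allocated)
    llama_context * ctx = llama_init_from_model(model, cparams);

    // toy token sequence; real code would tokenize a prompt first
    std::vector<llama_token> tokens = { 1, 2, 3 };
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    // embeddings now go through llama_encode(), not llama_decode()
    if (llama_encode(ctx, batch) != 0) {
        fprintf(stderr, "llama_encode() failed\n");
        return 1;
    }

    const float * embd = llama_get_embeddings_seq(ctx, 0); // pooled embedding of seq 0
    printf("n_embd = %d, embd[0] = %f\n",
           llama_model_n_embd(model), embd ? embd[0] : 0.0f);

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```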

View file

@@ -3214,7 +3214,14 @@ struct server_context {
                     batch.logits + i,
                 };
 
-                const int ret = llama_decode(ctx, batch_view);
+                int ret = 0;
+
+                if (params_base.embedding || params_base.reranking) {
+                    ret = llama_encode(ctx, batch_view);
+                } else {
+                    ret = llama_decode(ctx, batch_view);
+                }
+
                 metrics.on_decoded(slots);
 
                 if (ret != 0) {
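
Per the commit message, encode() also clears embd_seq, so pooled results from one batch cannot leak into the next. For the reranking side of the change, a hedged sketch of what a client can now do with llama_encode() alone, assuming a reranker model and a context created with cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; the helper name and the token-layout comment are illustrative, not server code:

```cpp
#include "llama.h"

#include <vector>

// returns the relevance score for one (query, document) pair; with rank
// pooling, llama_get_embeddings_seq() yields a single float per sequence.
// a real client builds `toks` per the model's template, e.g.
// [BOS] query [EOS] [SEP] document [EOS].
static float rerank_score(llama_context * ctx, std::vector<llama_token> & toks) {
    llama_batch batch = llama_batch_get_one(toks.data(), (int32_t) toks.size());
    if (llama_encode(ctx, batch) != 0) {
        return -1e6f; // encode failed
    }
    const float * out = llama_get_embeddings_seq(ctx, 0);
    return out ? out[0] : -1e6f;
}
```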
@@ -3943,7 +3950,7 @@ int main(int argc, char ** argv) {
     const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
             server_task_type type,
             json & data,
-            std::function<bool()> is_connection_closed,
+            const std::function<bool()> & is_connection_closed,
             httplib::Response & res,
             oaicompat_type oaicompat) {
         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
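
An unrelated tidy-up in the same commit: handle_completions_impl now takes its is_connection_closed callback by const reference. Passing a std::function by value copies the callable and any captured state at each call site; a toy illustration of the difference (not server code):

```cpp
#include <functional>

static void by_value(std::function<bool()> cb)         { cb(); } // copies cb and its captures
static void by_cref (const std::function<bool()> & cb) { cb(); } // uses the caller's object directly
```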