context : allow cache-less context for embeddings (#13108)

* context : allow cache-less context for embeddings

ggml-ci

* context : enable reranking with encode()

ggml-ci

* context : encode() clears embd_seq

ggml-ci

* examples : use llama_encode() when appropriate

ggml-ci

* models : nomic bert moe does not require KV cache

* llama : update comments for llama_decode/llama_encode

ggml-ci

* context : update warning log [no ci]
Author: Georgi Gerganov (committed by GitHub)
Date:   2025-05-08 14:28:33 +03:00
Commit: 6562e5a4d6
Parent: 51fb96b1ff
5 changed files with 47 additions and 23 deletions
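
The practical upshot for client code: embedding workloads should now go through llama_encode(), which after this commit can run on a context that never allocates a KV cache. Below is a minimal sketch of that path, assuming a pooled embedding model; the model filename, the toy token IDs, and the choice of mean pooling are placeholders, not part of this commit:

```cpp
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("embedding-model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                     // embedding mode
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN;  // one pooled vector per sequence

    // after this commit, such a context can be cache-less (no KV cache allocated)
    llama_context * ctx = llama_init_from_model(model, cparams);

    // toy token sequence; real code would tokenize a prompt first
    std::vector<llama_token> tokens = { 1, 2, 3 };
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    // embeddings now go through llama_encode(), not llama_decode()
    if (llama_encode(ctx, batch) != 0) {
        fprintf(stderr, "llama_encode() failed\n");
        return 1;
    }

    const float * embd = llama_get_embeddings_seq(ctx, 0); // pooled embedding of seq 0
    printf("n_embd = %d, embd[0] = %f\n",
           llama_model_n_embd(model), embd ? embd[0] : 0.0f);

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```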

View file

@@ -3214,7 +3214,14 @@ struct server_context {
                     batch.logits + i,
                 };
 
-                const int ret = llama_decode(ctx, batch_view);
+                int ret = 0;
+
+                if (params_base.embedding || params_base.reranking) {
+                    ret = llama_encode(ctx, batch_view);
+                } else {
+                    ret = llama_decode(ctx, batch_view);
+                }
+
                 metrics.on_decoded(slots);
 
                 if (ret != 0) {
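
Per the commit message, encode() also clears embd_seq, so pooled results from one batch cannot leak into the next. For the reranking side of the change, a hedged sketch of what a client can now do with llama_encode() alone, assuming a reranker model and a context created with cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; the helper name and the token-layout comment are illustrative, not server code:

```cpp
#include "llama.h"

#include <vector>

// returns the relevance score for one (query, document) pair; with rank
// pooling, llama_get_embeddings_seq() yields a single float per sequence.
// a real client builds `toks` per the model's template, e.g.
// [BOS] query [EOS] [SEP] document [EOS].
static float rerank_score(llama_context * ctx, std::vector<llama_token> & toks) {
    llama_batch batch = llama_batch_get_one(toks.data(), (int32_t) toks.size());
    if (llama_encode(ctx, batch) != 0) {
        return -1e6f; // encode failed
    }
    const float * out = llama_get_embeddings_seq(ctx, 0);
    return out ? out[0] : -1e6f;
}
```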
@@ -3943,7 +3950,7 @@ int main(int argc, char ** argv) {
     const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
             server_task_type type,
             json & data,
-            std::function<bool()> is_connection_closed,
+            const std::function<bool()> & is_connection_closed,
             httplib::Response & res,
             oaicompat_type oaicompat) {
         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
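
An unrelated tidy-up in the same commit: handle_completions_impl now takes its is_connection_closed callback by const reference. Passing a std::function by value copies the callable and any captured state at each call site; a toy illustration of the difference (not server code):

```cpp
#include <functional>

static void by_value(std::function<bool()> cb)         { cb(); } // copies cb and its captures
static void by_cref (const std::function<bool()> & cb) { cb(); } // uses the caller's object directly
```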