context : allow cache-less context for embeddings (#13108)
* context : allow cache-less context for embeddings ggml-ci
* context : enable reranking with encode() ggml-ci
* context : encode() clears embd_seq ggml-ci
* examples : use llama_encode() when appropriate ggml-ci
* models : nomic bert moe does not require KV cache
* llama : update comments for llama_decode/llama_encode ggml-ci
* context : update warning log [no ci]
parent 51fb96b1ff
commit 6562e5a4d6

5 changed files with 47 additions and 23 deletions
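As a sketch of the flow this commit enables (not part of the commit itself): an embeddings-only context can now skip the KV cache entirely and go through llama_encode(). The snippet below assumes the llama.cpp C API of this period; the model path and the token list are placeholders, and tokenization is elided.

// Hedged sketch: cache-less embeddings via llama_encode().
// Assumes llama.cpp's C API; "model.gguf" and `tokens` are placeholders.
#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    llama_backend_init();

    llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                    // embeddings-only context
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // one pooled vector per sequence
    llama_context * ctx = llama_init_from_model(model, cparams);

    std::vector<llama_token> tokens = { /* tokenized prompt, elided */ };
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    // After this commit, encode() is the entry point for embeddings:
    // it runs the model without requiring a KV cache.
    if (llama_encode(ctx, batch) != 0) {
        fprintf(stderr, "llama_encode() failed\n");
        return 1;
    }

    const float * emb = llama_get_embeddings_seq(ctx, 0);
    if (emb != nullptr) {
        printf("embedding[0] = %f (n_embd = %d)\n", emb[0], llama_model_n_embd(model));
    }

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}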
@@ -3214,7 +3214,14 @@ struct server_context {
                     batch.logits + i,
                 };
 
-                const int ret = llama_decode(ctx, batch_view);
+                int ret = 0;
+
+                if (params_base.embedding || params_base.reranking) {
+                    ret = llama_encode(ctx, batch_view);
+                } else {
+                    ret = llama_decode(ctx, batch_view);
+                }
+
                 metrics.on_decoded(slots);
 
                 if (ret != 0) {
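The hunk above routes embedding and reranking batches through llama_encode() and leaves llama_decode() for generation, where the KV cache is actually needed; per the commit message, encode() also clears embd_seq between calls. For the reranking case specifically, here is a hedged readout sketch, assuming the context was created with pooling_type = LLAMA_POOLING_TYPE_RANK, where the per-sequence "embedding" is a single relevance score:

// Sketch, not from the commit: with rank pooling, llama_get_embeddings_seq()
// yields one float per sequence after a successful llama_encode().
const float * score = llama_get_embeddings_seq(ctx, seq_id);
if (score != nullptr) {
    printf("relevance score for seq %d: %f\n", (int) seq_id, score[0]);
}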
@@ -3943,7 +3950,7 @@ int main(int argc, char ** argv) {
     const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
             server_task_type type,
             json & data,
-            std::function<bool()> is_connection_closed,
+            const std::function<bool()> & is_connection_closed,
             httplib::Response & res,
             oaicompat_type oaicompat) {
         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
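The second hunk is a small, unrelated cleanup: taking std::function by const reference avoids copying the callable and its captured state on every request. A generic illustration, not taken from the server code:

#include <cstdio>
#include <functional>

// By-value std::function parameters copy the callable (and its captures) on
// each call; a const reference avoids that when the callee only invokes it.
static bool poll(const std::function<bool()> & is_connection_closed) {
    return is_connection_closed();
}

int main() {
    bool closed = false;
    printf("connection closed: %d\n", (int) poll([&] { return closed; }));
    return 0;
}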