server : added --no-prefill-assistant flag (#13608)
* added no-prefill-assistant flag
* reworded documentation comment
* updated server README.md
parent e3a7cf6c5b
commit 6a2bc8bfb7
5 changed files with 17 additions and 1 deletion
```diff
@@ -583,6 +583,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
     const struct common_chat_templates * tmpls,
     bool allow_non_text,
```
```diff
@@ -732,7 +733,7 @@ static json oaicompat_completion_params_parse(
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
```
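The following is a minimal, standalone sketch of how the prefill decision in the hunk above behaves, not the server's actual code: `chat_msg` and `should_prefill` are hypothetical stand-ins for the server's chat types, and the `prefill_assistant` argument mirrors the new parameter, which is presumably false when the server is started with `--no-prefill-assistant`.

```cpp
// Sketch only: illustrates the condition added in this commit with simplified types.
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for the server's chat message type.
struct chat_msg {
    std::string role;
    std::string content;
};

// Mirrors the condition from the diff: prefill only when the last message has the
// "assistant" role AND prefilling has not been disabled (--no-prefill-assistant).
static bool should_prefill(const std::vector<chat_msg> & messages, bool prefill_assistant) {
    return !messages.empty() && messages.back().role == "assistant" && prefill_assistant;
}

int main() {
    const std::vector<chat_msg> messages = {
        {"user",      "Summarize the plan."},
        {"assistant", "Here is a partial answer that the model should continue:"},
    };

    std::cout << "default (prefill on):        " << should_prefill(messages, true)  << "\n"; // 1
    std::cout << "with --no-prefill-assistant: " << should_prefill(messages, false) << "\n"; // 0
    return 0;
}
```

Per the comment in the diff, when the condition holds the server continues the trailing assistant message without adding an end-of-turn token; with the new flag the condition is false, so that message is treated as a completed turn instead.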