server : fix tool-call parsing for DeepSeek R1 Qwen; return reasoning_content (Command R7B & DeepSeek R1) unless --reasoning-format none (#11607)

* extract & return thoughts in a reasoning_content field (unless --reasoning-format none) for DeepSeek R1 & Command R7B
* tool-calls: add a DeepSeek R1 template (models/templates/llama-cpp-deepseek-r1.jinja) + a hack to accommodate the broken official template
* tool-calls: accommodate the variety of malformed tool-call opening tags that both the R1 Qwen 32B and 7B distills like to emit
* server/oai: ensure content is null when there are tool calls, and that reasoning_content appears before content for readability
* tool-calls: add the DeepSeek R1 Qwen distills to server/README.md & the server tests

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
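For illustration, a minimal sketch of the assistant-message shape this change produces, using nlohmann::ordered_json as the server does (insertion order is preserved on serialization, so reasoning_content comes out before content); the tool name, arguments, and id below are made-up values, not from this commit:

```cpp
#include <cstdio>
#include <nlohmann/json.hpp>

// llama.cpp's server aliases json to nlohmann::ordered_json, which keeps
// insertion order when dumping.
using json = nlohmann::ordered_json;

int main() {
    // Made-up tool call, shaped like the objects built in the diff below.
    json tool_call {
        {"type", "function"},
        {"function", {
            {"name",      "get_weather"},              // hypothetical tool
            {"arguments", "{\"location\": \"Paris\"}"},
        }},
        {"id", "call_1"},
    };

    // After this change: reasoning_content is emitted first, and content
    // is null (not "") whenever the model produced tool calls.
    json message {
        {"role", "assistant"},
        {"reasoning_content", "User wants the weather; call the tool."},
        {"content", nullptr},
        {"tool_calls", json::array({tool_call})},
    };

    printf("%s\n", message.dump(2).c_str());
    return 0;
}
```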
This commit is contained in:
parent 27e8a23300
commit c7f460ab88
17 changed files with 1023 additions and 316 deletions
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words",  grammar_trigger_words},
             {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"preserved_tokens",       sampling.preserved_tokens},
+            {"chat_format",            common_chat_format_name(oaicompat_chat_format)},
             {"samplers",               samplers},
             {"speculative.n_max",      speculative.n_max},
             {"speculative.n_min",      speculative.n_min},
@@ -724,9 +725,19 @@ struct server_task_result_cmpl_final : server_task_result {
             msg.content = content;
         }

-        json tool_calls;
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
+            message["tool_calls"] = tool_calls;
         }
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
-        }

         json choice {
@@ -4060,7 +4063,7 @@ int main(int argc, char ** argv) {
         }

         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);

         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4073,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };

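The two hunks above thread the new params.reasoning_format option into request parsing. Below is a minimal sketch of the gating this enables, per the commit title: with --reasoning-format none the <think> block stays inline in content, otherwise it is split out into reasoning_content. The enum values and helper are illustrative assumptions, not the verbatim llama.cpp code:

```cpp
#include <string>

// Assumed enum mirroring the --reasoning-format flag; the real names in
// llama.cpp's common headers may differ.
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,     // leave <think>...</think> inline in content
    COMMON_REASONING_FORMAT_DEEPSEEK, // move thoughts into reasoning_content
};

struct parsed_msg {
    std::string content;
    std::string reasoning_content;
};

// Hypothetical condensed extraction: split a <think>...</think> block
// out of the raw model output when reasoning extraction is enabled.
static parsed_msg extract_reasoning(const std::string & output, common_reasoning_format fmt) {
    parsed_msg msg;
    const std::string open = "<think>", close = "</think>";
    const auto b = output.find(open);
    const auto e = output.find(close);
    if (fmt == COMMON_REASONING_FORMAT_NONE ||
        b == std::string::npos || e == std::string::npos || e < b) {
        msg.content = output; // no extraction: thoughts (if any) stay in content
        return msg;
    }
    msg.reasoning_content = output.substr(b + open.size(), e - (b + open.size()));
    msg.content           = output.substr(e + close.size());
    return msg;
}
```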