tool-call: refactor common chat / tool-call api (+ tests / fixes) (#11900)
* tool-call refactoring: moved common_chat_* to chat.h, common_chat_templates_init return a unique_ptr to opaque type * addressed clang-tidy lints in [test-]chat.* * rm minja deps from util & common & move it to common/minja/ * add name & tool_call_id to common_chat_msg * add common_chat_tool * added json <-> tools, msgs conversions to chat.h * fix double bos/eos jinja avoidance hack (was preventing inner bos/eos tokens) * fix deepseek r1 slow test (no longer <think> opening w/ new template) * allow empty tools w/ auto + grammar * fix & test server grammar & json_schema params w/ & w/o --jinja
This commit is contained in:
parent
63ac128563
commit
63e489c025
18 changed files with 1385 additions and 993 deletions
|
@ -4,7 +4,7 @@
|
|||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
#include "chat-template.hpp"
|
||||
#include "chat.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
auto chat_templates = common_chat_templates_from_model(model, params.chat_template);
|
||||
auto chat_templates = common_chat_templates_init(model, params.chat_template);
|
||||
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
|
@ -201,7 +201,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// auto enable conversation mode if chat template is available
|
||||
const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
|
||||
const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get());
|
||||
if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
|
||||
if (has_chat_template) {
|
||||
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
|
||||
|
@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
|
|||
// print chat template example in conversation mode
|
||||
if (params.conversation_mode) {
|
||||
if (params.enable_chat_template) {
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
|
||||
} else {
|
||||
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||
}
|
||||
|
@ -264,9 +264,11 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_token> embd_inp;
|
||||
|
||||
auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
|
||||
common_chat_msg new_msg{role, content, {}};
|
||||
auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
|
||||
chat_msgs.push_back({role, content, {}});
|
||||
common_chat_msg new_msg;
|
||||
new_msg.role = role;
|
||||
new_msg.content = content;
|
||||
auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja);
|
||||
chat_msgs.push_back(new_msg);
|
||||
LOG_DBG("formatted: '%s'\n", formatted.c_str());
|
||||
return formatted;
|
||||
};
|
||||
|
@ -755,11 +757,14 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// check for reverse prompt using special tokens
|
||||
llama_token last_token = common_sampler_last(smpl);
|
||||
if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
for (auto token : antiprompt_token) {
|
||||
if (token == last_token) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
}
|
||||
|
||||
if (is_antiprompt) {
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "chat-template.hpp"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "json.hpp"
|
||||
#include "linenoise.cpp/linenoise.h"
|
||||
|
@ -557,7 +557,7 @@ class LlamaData {
|
|||
llama_model_ptr model;
|
||||
llama_sampler_ptr sampler;
|
||||
llama_context_ptr context;
|
||||
std::vector<llama_chat_message> messages;
|
||||
std::vector<llama_chat_message> messages; // TODO: switch to common_chat_msg
|
||||
std::list<std::string> msg_strs;
|
||||
std::vector<char> fmtted;
|
||||
|
||||
|
@ -834,44 +834,23 @@ static void add_message(const char * role, const std::string & text, LlamaData &
|
|||
}
|
||||
|
||||
// Function to apply the chat template and resize `formatted` if needed
|
||||
static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
|
||||
if (use_jinja) {
|
||||
json messages = json::array();
|
||||
for (const auto & msg : llama_data.messages) {
|
||||
messages.push_back({
|
||||
{"role", msg.role},
|
||||
{"content", msg.content},
|
||||
});
|
||||
}
|
||||
try {
|
||||
minja::chat_template_inputs tmpl_inputs;
|
||||
tmpl_inputs.messages = messages;
|
||||
tmpl_inputs.add_generation_prompt = append;
|
||||
|
||||
minja::chat_template_options tmpl_opts;
|
||||
tmpl_opts.use_bos_token = false;
|
||||
tmpl_opts.use_eos_token = false;
|
||||
|
||||
auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
|
||||
llama_data.fmtted.resize(result.size() + 1);
|
||||
memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
|
||||
return result.size();
|
||||
} catch (const std::exception & e) {
|
||||
printe("failed to render the chat template: %s\n", e.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
int result = llama_chat_apply_template(
|
||||
tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
|
||||
append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
|
||||
if (append && result > static_cast<int>(llama_data.fmtted.size())) {
|
||||
llama_data.fmtted.resize(result);
|
||||
result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
|
||||
llama_data.messages.size(), append, llama_data.fmtted.data(),
|
||||
llama_data.fmtted.size());
|
||||
static int apply_chat_template(const struct common_chat_templates * tmpls, LlamaData & llama_data, const bool append, bool use_jinja) {
|
||||
common_chat_templates_inputs inputs;
|
||||
for (const auto & msg : llama_data.messages) {
|
||||
common_chat_msg cmsg;
|
||||
cmsg.role = msg.role;
|
||||
cmsg.content = msg.content;
|
||||
inputs.messages.push_back(cmsg);
|
||||
}
|
||||
inputs.add_generation_prompt = append;
|
||||
inputs.use_jinja = use_jinja;
|
||||
|
||||
return result;
|
||||
auto chat_params = common_chat_templates_apply(tmpls, inputs);
|
||||
// TODO: use other params for tool calls.
|
||||
auto result = chat_params.prompt;
|
||||
llama_data.fmtted.resize(result.size() + 1);
|
||||
memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
|
||||
return result.size();
|
||||
}
|
||||
|
||||
// Function to tokenize the prompt
|
||||
|
@ -1015,8 +994,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
|
|||
}
|
||||
|
||||
// Helper function to apply the chat template and handle errors
|
||||
static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
|
||||
const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
|
||||
static int apply_chat_template_with_error_handling(const common_chat_templates * tmpls, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
|
||||
const int new_len = apply_chat_template(tmpls, llama_data, append, use_jinja);
|
||||
if (new_len < 0) {
|
||||
printe("failed to apply the chat template\n");
|
||||
return -1;
|
||||
|
@ -1078,8 +1057,7 @@ static int get_user_input(std::string & user_input, const std::string & user) {
|
|||
static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
|
||||
int prev_len = 0;
|
||||
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
|
||||
auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
|
||||
GGML_ASSERT(chat_templates.template_default);
|
||||
auto chat_templates = common_chat_templates_init(llama_data.model.get(), "");
|
||||
static const bool stdout_a_terminal = is_stdout_a_terminal();
|
||||
while (true) {
|
||||
// Get user input
|
||||
|
@ -1090,7 +1068,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
|
|||
|
||||
add_message("user", user.empty() ? user_input : user, llama_data);
|
||||
int new_len;
|
||||
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
|
||||
if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, true, new_len, use_jinja) < 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -1105,7 +1083,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
|
|||
}
|
||||
|
||||
add_message("assistant", response, llama_data);
|
||||
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
|
||||
if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, false, prev_len, use_jinja) < 0) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -329,9 +329,6 @@ struct server_task {
|
|||
}
|
||||
|
||||
// process "json_schema" and "grammar"
|
||||
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
|
||||
throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
|
||||
}
|
||||
if (data.contains("json_schema") && !data.contains("grammar")) {
|
||||
try {
|
||||
auto schema = json_value(data, "json_schema", json::object());
|
||||
|
@ -1807,7 +1804,7 @@ struct server_context {
|
|||
// Necessary similarity of prompt for slot selection
|
||||
float slot_prompt_similarity = 0.0f;
|
||||
|
||||
common_chat_templates chat_templates;
|
||||
common_chat_templates_ptr chat_templates;
|
||||
|
||||
~server_context() {
|
||||
// Clear any sampling context
|
||||
|
@ -1891,45 +1888,17 @@ struct server_context {
|
|||
llama_init_dft.context.reset();
|
||||
}
|
||||
|
||||
if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
|
||||
chat_templates = common_chat_templates_init(model, params_base.chat_template);
|
||||
try {
|
||||
common_chat_format_example(chat_templates.get(), params.use_jinja);
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
|
||||
chat_templates = common_chat_templates_from_model(model, "chatml");
|
||||
} else {
|
||||
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
|
||||
chat_templates = common_chat_templates_init(model, "chatml");
|
||||
}
|
||||
GGML_ASSERT(chat_templates.template_default.get() != nullptr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool validate_builtin_chat_template(bool use_jinja) const {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
|
||||
if (use_jinja) {
|
||||
auto templates = common_chat_templates_from_model(model, "");
|
||||
common_chat_inputs inputs;
|
||||
inputs.messages = json::array({{
|
||||
{"role", "user"},
|
||||
{"content", "test"},
|
||||
}});
|
||||
GGML_ASSERT(templates.template_default);
|
||||
try {
|
||||
common_chat_params_init(*templates.template_default, inputs);
|
||||
if (templates.template_tool_use) {
|
||||
common_chat_params_init(*templates.template_tool_use, inputs);
|
||||
}
|
||||
return true;
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("failed to apply template: %s\n", e.what());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
|
||||
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
|
||||
return chat_res > 0;
|
||||
}
|
||||
}
|
||||
|
||||
void init() {
|
||||
const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
|
||||
|
||||
|
@ -3822,13 +3791,15 @@ int main(int argc, char ** argv) {
|
|||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "model_path", ctx_server.params_base.model },
|
||||
{ "chat_template", ctx_server.chat_templates.template_default->source() },
|
||||
{ "bos_token", ctx_server.chat_templates.template_default->bos_token() },
|
||||
{ "eos_token", ctx_server.chat_templates.template_default->eos_token() },
|
||||
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
||||
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
||||
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
||||
{ "build_info", build_info },
|
||||
};
|
||||
if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
|
||||
data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
|
||||
if (ctx_server.params_base.use_jinja) {
|
||||
if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
|
||||
data["chat_template_tool_use"] = tool_use_src;
|
||||
}
|
||||
}
|
||||
|
||||
res_ok(res, data);
|
||||
|
@ -4063,7 +4034,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
auto body = json::parse(req.body);
|
||||
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
|
||||
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
|
||||
|
||||
return handle_completions_impl(
|
||||
SERVER_TASK_TYPE_COMPLETION,
|
||||
|
@ -4076,7 +4047,7 @@ int main(int argc, char ** argv) {
|
|||
// same with handle_chat_completions, but without inference part
|
||||
const auto handle_apply_template = [&ctx_server, ¶ms, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
auto body = json::parse(req.body);
|
||||
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
|
||||
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
|
||||
res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
|
||||
};
|
||||
|
||||
|
@ -4493,8 +4464,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// print sample chat example to make it clear which template is used
|
||||
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
|
||||
ctx_server.chat_templates.template_default->source().c_str(),
|
||||
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
|
||||
common_chat_templates_source(ctx_server.chat_templates.get()),
|
||||
common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
|
||||
|
||||
ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
|
||||
ctx_server.process_single_task(task);
|
||||
|
|
|
@ -21,6 +21,8 @@ def create_server():
|
|||
(None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
|
||||
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
|
||||
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
|
||||
(None, "Book", [{"type": "text", "text": "What is"}, {"type": "text", "text": "the best book"}], 8, "Whillicter", 79, 8, "length", False, None),
|
||||
(None, "Book", [{"type": "text", "text": "What is"}, {"type": "text", "text": "the best book"}], 8, "Whillicter", 79, 8, "length", True, None),
|
||||
]
|
||||
)
|
||||
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
|
||||
|
@ -44,7 +46,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
|
|||
assert res.body["usage"]["completion_tokens"] == n_predicted
|
||||
choice = res.body["choices"][0]
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert match_regex(re_content, choice["message"]["content"])
|
||||
assert match_regex(re_content, choice["message"]["content"]), f'Expected {re_content}, got {choice["message"]["content"]}'
|
||||
assert choice["finish_reason"] == finish_reason
|
||||
|
||||
|
||||
|
@ -169,6 +171,47 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
|
|||
assert "error" in res.body
|
||||
|
||||
|
||||
@pytest.mark.parametrize("jinja,json_schema,n_predicted,re_content", [
|
||||
(False, {"const": "42"}, 6, "\"42\""),
|
||||
(True, {"const": "42"}, 6, "\"42\""),
|
||||
])
|
||||
def test_completion_with_json_schema(jinja: bool, json_schema: dict, n_predicted: int, re_content: str):
|
||||
global server
|
||||
server.jinja = jinja
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": n_predicted,
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a coding assistant."},
|
||||
{"role": "user", "content": "Write an example"},
|
||||
],
|
||||
"json_schema": json_schema,
|
||||
})
|
||||
assert res.status_code == 200, f'Expected 200, got {res.status_code}'
|
||||
choice = res.body["choices"][0]
|
||||
assert match_regex(re_content, choice["message"]["content"]), f'Expected {re_content}, got {choice["message"]["content"]}'
|
||||
|
||||
|
||||
@pytest.mark.parametrize("jinja,grammar,n_predicted,re_content", [
|
||||
(False, 'root ::= "a"{5,5}', 6, "a{5,5}"),
|
||||
(True, 'root ::= "a"{5,5}', 6, "a{5,5}"),
|
||||
])
|
||||
def test_completion_with_grammar(jinja: bool, grammar: str, n_predicted: int, re_content: str):
|
||||
global server
|
||||
server.jinja = jinja
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": n_predicted,
|
||||
"messages": [
|
||||
{"role": "user", "content": "Does not matter what I say, does it?"},
|
||||
],
|
||||
"grammar": grammar,
|
||||
})
|
||||
assert res.status_code == 200, res.body
|
||||
choice = res.body["choices"][0]
|
||||
assert match_regex(re_content, choice["message"]["content"]), choice["message"]["content"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("messages", [
|
||||
None,
|
||||
"string",
|
||||
|
|
|
@ -356,12 +356,12 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
|
|||
(None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
|
||||
(None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
|
||||
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
|
||||
("^> 0.56$", 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
|
||||
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
# TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
|
||||
("^The y-coordinate [\\s\\S]*?\\*\\*0.5\\*\\*", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
("[\\s\\S]*?\\*\\*0\\.5\\*\\*", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
# ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
])
|
||||
def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
|
||||
global server
|
||||
|
@ -401,7 +401,7 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
|||
{
|
||||
"role": "tool",
|
||||
"name": "calculate",
|
||||
"content": 0.55644242476,
|
||||
"content": "0.55644242476",
|
||||
"tool_call_id": "call_6789"
|
||||
}
|
||||
],
|
||||
|
@ -444,7 +444,7 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
|||
(128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
|
||||
|
||||
(1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'none', "<think>\n?I need[\\s\\S]*?</think>\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
(1024, 'none', "^I need[\\s\\S]*?</think>\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
|
||||
|
||||
(1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
|
||||
])
|
||||
|
|
|
@ -12,9 +12,7 @@
|
|||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
#include "minja.hpp"
|
||||
#include "chat.hpp"
|
||||
#include "chat-template.hpp"
|
||||
#include "chat.h"
|
||||
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
|
@ -347,41 +345,6 @@ static llama_tokens format_infill(
|
|||
return embd_inp;
|
||||
}
|
||||
|
||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||
inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
|
||||
std::vector<common_chat_msg> chat;
|
||||
|
||||
for (size_t i = 0; i < messages.size(); ++i) {
|
||||
const auto & curr_msg = messages[i];
|
||||
|
||||
std::string role = json_value(curr_msg, "role", std::string(""));
|
||||
|
||||
std::string content;
|
||||
if (curr_msg.contains("content")) {
|
||||
if (curr_msg["content"].is_string()) {
|
||||
content = curr_msg["content"].get<std::string>();
|
||||
} else if (curr_msg["content"].is_array()) {
|
||||
for (const auto & part : curr_msg["content"]) {
|
||||
if (part.contains("text")) {
|
||||
content += "\n" + part["text"].get<std::string>();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
}
|
||||
|
||||
chat.push_back({role, content, /* tool_calls= */ {}});
|
||||
}
|
||||
|
||||
const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
|
||||
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
|
||||
|
||||
return formatted_chat;
|
||||
}
|
||||
|
||||
//
|
||||
// base64 utils (TODO: move to common in the future)
|
||||
//
|
||||
|
@ -579,12 +542,9 @@ static json oaicompat_completion_params_parse(
|
|||
const json & body, /* openai api json semantics */
|
||||
bool use_jinja,
|
||||
common_reasoning_format reasoning_format,
|
||||
const common_chat_templates & chat_templates)
|
||||
const struct common_chat_templates * tmpls)
|
||||
{
|
||||
json llama_params;
|
||||
const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
|
||||
? *chat_templates.template_tool_use
|
||||
: *chat_templates.template_default;
|
||||
|
||||
auto tools = json_value(body, "tools", json());
|
||||
auto stream = json_value(body, "stream", false);
|
||||
|
@ -610,62 +570,58 @@ static json oaicompat_completion_params_parse(
|
|||
llama_params["stop"] = json_value(body, "stop", json::array());
|
||||
}
|
||||
|
||||
auto json_schema = json_value(body, "json_schema", json());
|
||||
auto grammar = json_value(body, "grammar", std::string());
|
||||
if (!json_schema.is_null() && !grammar.empty()) {
|
||||
throw std::runtime_error("Cannot use both json_schema and grammar");
|
||||
}
|
||||
|
||||
// Handle "response_format" field
|
||||
if (body.contains("response_format")) {
|
||||
json response_format = json_value(body, "response_format", json::object());
|
||||
std::string response_type = json_value(response_format, "type", std::string());
|
||||
if (response_type == "json_object") {
|
||||
llama_params["json_schema"] = json_value(response_format, "schema", json::object());
|
||||
json_schema = json_value(response_format, "schema", json::object());
|
||||
} else if (response_type == "json_schema") {
|
||||
json json_schema = json_value(response_format, "json_schema", json::object());
|
||||
llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
|
||||
json_schema = json_value(json_schema, "schema", json::object());
|
||||
} else if (!response_type.empty() && response_type != "text") {
|
||||
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
if (use_jinja) {
|
||||
auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
|
||||
if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
|
||||
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
|
||||
}
|
||||
if (tool_choice != "none" && llama_params.contains("grammar")) {
|
||||
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
||||
}
|
||||
common_chat_inputs inputs;
|
||||
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
inputs.messages = body.at("messages");
|
||||
inputs.tools = tools;
|
||||
inputs.tool_choice = tool_choice;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
||||
if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
|
||||
LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
|
||||
inputs.parallel_tool_calls = false;
|
||||
}
|
||||
inputs.stream = stream;
|
||||
// TODO: support mixing schema w/ tools beyond generic format.
|
||||
inputs.json_schema = json_value(llama_params, "json_schema", json());
|
||||
auto chat_params = common_chat_params_init(tmpl, inputs);
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
|
||||
inputs.tools = common_chat_tools_parse_oaicompat(tools);
|
||||
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
|
||||
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
|
||||
inputs.grammar = grammar;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.use_jinja = use_jinja;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
||||
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
|
||||
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
||||
}
|
||||
|
||||
llama_params["chat_format"] = static_cast<int>(chat_params.format);
|
||||
llama_params["prompt"] = chat_params.prompt;
|
||||
llama_params["grammar"] = chat_params.grammar;
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
grammar_triggers.push_back({
|
||||
{"word", trigger.word},
|
||||
{"at_start", trigger.at_start},
|
||||
});
|
||||
}
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
for (const auto & stop : chat_params.additional_stops) {
|
||||
llama_params["stop"].push_back(stop);
|
||||
}
|
||||
} else {
|
||||
llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
|
||||
// Apply chat template to the list of messages
|
||||
auto chat_params = common_chat_templates_apply(tmpls, inputs);
|
||||
|
||||
llama_params["chat_format"] = static_cast<int>(chat_params.format);
|
||||
llama_params["prompt"] = chat_params.prompt;
|
||||
llama_params["grammar"] = chat_params.grammar;
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
grammar_triggers.push_back({
|
||||
{"word", trigger.word},
|
||||
{"at_start", trigger.at_start},
|
||||
});
|
||||
}
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
for (const auto & stop : chat_params.additional_stops) {
|
||||
llama_params["stop"].push_back(stop);
|
||||
}
|
||||
|
||||
// Handle "n" field
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue