tool-call: fix Qwen 2.5 Coder support, add micro benchmarks, support trigger patterns for lazy grammars (#12034)
* sampler: turn lazy grammar trigger words to regexes
* add scripts/tool_bench.sh & .py
* constrain llama json output regardless of function name if matches at beginning
* update relaxed newline space rule in grammar tests
* support add_generation_prompt query parameter (useful for /apply_template)
* Update src/llama-grammar.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Parent: fa31c438e0
Commit: 669912d9a5
26 changed files with 1314 additions and 408 deletions
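The headline change is that lazy grammars can now be armed by regular expressions rather than only literal trigger words. A minimal usage sketch of the new entry point whose definition appears in the diff below; the GBNF string and trigger pattern here are illustrative, and `vocab` is assumed to come from a loaded model:

```cpp
#include "llama.h"

// Sketch: build a sampler whose grammar stays dormant until the model's
// output matches a trigger regex. The grammar and pattern are examples,
// not part of this PR.
static struct llama_sampler * make_tool_call_sampler(const struct llama_vocab * vocab) {
    const char * grammar_str = "root ::= \"{\" [^}]* \"}\"";  // illustrative GBNF

    // Must match the ENTIRE generated string; the grammar is applied from
    // the first capture group onwards.
    const char * patterns[] = { "[\\s\\S]*?(\\{[\\s\\S]*)" };

    return llama_sampler_init_grammar_lazy_patterns(
        vocab, grammar_str, "root",
        patterns, /* num_trigger_patterns = */ 1,
        /* trigger_tokens     = */ nullptr,
        /* num_trigger_tokens = */ 0);
}
```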
src/llama-grammar.cpp:

```diff
@@ -969,7 +969,7 @@ struct llama_grammar * llama_grammar_init_impl(
         /* .awaiting_trigger = */ false,
         /* .trigger_buffer   = */ "",
         /* .trigger_tokens   = */ {},
-        /* .trigger_words    = */ {},
+        /* .trigger_patterns = */ {},
     };
 }
 
@@ -978,19 +978,15 @@ struct llama_grammar * llama_grammar_init_impl(
         const char * grammar_str,
         const char * grammar_root,
         bool lazy,
-        const char ** trigger_words,
-        size_t num_trigger_words,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
     llama_grammar_parser parser;
 
     // if there is a grammar, parse it
-    if (!parser.parse(grammar_str)) {
-        return nullptr;
-    }
-
-    // will be empty (default) if there are parse errors
-    if (parser.rules.empty()) {
+    // rules will be empty (default) if there are parse errors
+    if (!parser.parse(grammar_str) || parser.rules.empty()) {
         fprintf(stderr, "%s: failed to parse grammar\n", __func__);
         return nullptr;
     }
@@ -1054,14 +1050,16 @@ struct llama_grammar * llama_grammar_init_impl(
     } while (true);
 
     std::vector<llama_token> vec_trigger_tokens;
-    std::vector<std::string> vec_trigger_words;
+    std::vector<llama_grammar_trigger_pattern> vec_trigger_patterns;
     for (size_t i = 0; i < num_trigger_tokens; i++) {
         GGML_ASSERT(trigger_tokens != nullptr);
         vec_trigger_tokens.push_back(trigger_tokens[i]);
     }
-    for (size_t i = 0; i < num_trigger_words; i++) {
-        GGML_ASSERT(trigger_words != nullptr);
-        vec_trigger_words.push_back(trigger_words[i]);
+    for (size_t i = 0; i < num_trigger_patterns; i++) {
+        GGML_ASSERT(trigger_patterns != nullptr);
+        auto & trigger = vec_trigger_patterns.emplace_back();
+        trigger.pattern = trigger_patterns[i];
+        trigger.regex = std::regex(trigger.pattern);
     }
 
     // Important: vec_rules has to be moved here, not copied, because stacks contains
```
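For reference, here is the shape of that per-pattern state in isolation: each pattern string is compiled once at init time, and the raw string is kept next to the compiled `std::regex` (the sampler's reset path further down re-creates the grammar from those strings). A standalone sketch, not the PR's exact code:

```cpp
#include <cstddef>
#include <regex>
#include <string>
#include <vector>

// Stand-in for the llama_grammar_trigger_pattern struct added by this PR:
// the raw pattern string travels with its compiled regex.
struct trigger_pattern {
    std::string pattern;
    std::regex  regex;
};

// Sketch of the init loop above: compile every pattern once, up front.
// std::regex's constructor throws std::regex_error on a malformed pattern,
// so bad patterns fail at sampler construction rather than mid-generation.
static std::vector<trigger_pattern> compile_patterns(const char ** patterns, std::size_t n) {
    std::vector<trigger_pattern> out;
    out.reserve(n);
    for (std::size_t i = 0; i < n; i++) {
        auto & t  = out.emplace_back();
        t.pattern = patterns[i];
        t.regex   = std::regex(t.pattern);
    }
    return out;
}
```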
```diff
@@ -1076,7 +1074,7 @@ struct llama_grammar * llama_grammar_init_impl(
         /* .awaiting_trigger = */ lazy,
         /* .trigger_buffer   = */ "",
         std::move(vec_trigger_tokens),
-        std::move(vec_trigger_words),
+        std::move(vec_trigger_patterns),
     };
 }
 
@@ -1089,7 +1087,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar {
+    auto * result = new llama_grammar {
         grammar.vocab,
         grammar.rules,
         grammar.stacks,
@@ -1098,7 +1096,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         grammar.awaiting_trigger,
         grammar.trigger_buffer,
         grammar.trigger_tokens,
-        grammar.trigger_words,
+        grammar.trigger_patterns,
     };
 
     // redirect elements in stacks to point to new rules
@@ -1173,16 +1171,18 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
             LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
             return;
         } else {
-            // TODO: consider a smarter incremental substring search algorithm (store last position to search from).
             grammar.trigger_buffer += piece;
-            for (const auto & word : grammar.trigger_words) {
-                auto pos = grammar.trigger_buffer.find(word);
-                if (pos != std::string::npos) {
+
+            std::smatch match;
+            for (const auto & trigger_pattern : grammar.trigger_patterns) {
+                if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
                     grammar.awaiting_trigger = false;
-                    auto constrained_str = grammar.trigger_buffer.substr(pos);
+                    // get from the first match to the end of the string
+                    auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
+                    // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                     grammar.trigger_buffer.clear();
                     llama_grammar_accept_str(grammar, constrained_str);
-                    LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
+                    LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
                     return;
                 }
             }
```
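The trigger check above uses `std::regex_match`, so a pattern must span the whole buffered output, and `match.position(1)` marks where the grammar-constrained text begins. A self-contained illustration of that semantics; the `<tool_call>` tag is just an example trigger:

```cpp
#include <cassert>
#include <regex>
#include <string>

int main() {
    // Output buffered so far by a lazy grammar: freeform preamble, then the
    // trigger, then the start of a tool call.
    std::string buffer = "Sure, here you go: <tool_call>{\"name\":";

    // Lazy [\s\S]*? prefix, trigger in group 1, anything after. The whole
    // buffer must match, per std::regex_match semantics.
    std::regex re("[\\s\\S]*?(<tool_call>)[\\s\\S]*");

    std::smatch match;
    if (std::regex_match(buffer, match, re)) {
        // Everything from the first capture group onwards is handed to the
        // grammar; the preamble before it stays unconstrained.
        std::string constrained = buffer.substr(match.position(1));
        assert(constrained == "<tool_call>{\"name\":");
    }
}
```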
src/llama-grammar.h:

```diff
@@ -3,6 +3,7 @@
 #include "llama.h"
 
 #include <map>
+#include <regex>
 #include <string>
 #include <vector>
 
@@ -105,6 +106,11 @@ struct llama_grammar_parser {
     void print(FILE * file);
 };
 
+struct llama_grammar_trigger_pattern {
+    std::string pattern;
+    std::regex  regex;
+};
+
 struct llama_grammar {
     // note: allow null vocab for testing (not great)
     const llama_vocab * vocab;
@@ -122,7 +128,10 @@ struct llama_grammar {
     bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
     std::string              trigger_buffer;           // Output buffered by lazy grammar. Will be cleared once trigger is found.
     std::vector<llama_token> trigger_tokens;           // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
-    std::vector<std::string> trigger_words;
+    std::vector<llama_grammar_trigger_pattern>
+                             trigger_patterns;         // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
+                                                       // string, and the grammar will be given the string from the first match group onwards.
+
 };
 
 //
@@ -141,8 +150,8 @@ struct llama_grammar * llama_grammar_init_impl(
         const char * grammar_str,
         const char * grammar_root,
         bool lazy,
-        const char ** trigger_words,
-        size_t num_trigger_words,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens);
 
```
src/llama-sampling.cpp:

```diff
@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens);
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns);
 
 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         return;
     }
 
-    std::vector<const char *> trigger_words;
-    for (auto & word : ctx->grammar->trigger_words) {
-        trigger_words.push_back(word.c_str());
+    std::vector<const char *> trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
 
     auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-                                                 ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+                                                 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
                                                  ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
 
     llama_grammar_free_impl(ctx->grammar);
```
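Keeping the raw `pattern` string next to the compiled regex is what makes this reset possible: `std::regex` has no portable way to recover its source text, so the string is the canonical form from which the grammar is rebuilt. A sketch of that borrow, reusing the `trigger_pattern` stand-in from the earlier sketch:

```cpp
#include <vector>

// Collect C-string views of the stored patterns so the grammar can be
// re-initialized from plain `const char *` arrays, as in the reset above.
// The pointers borrow from `patterns`; the vector must outlive the result.
static std::vector<const char *> to_c_strings(const std::vector<trigger_pattern> & patterns) {
    std::vector<const char *> out;
    out.reserve(patterns.size());
    for (const auto & p : patterns) {
        out.push_back(p.pattern.c_str());
    }
    return out;
}
```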
```diff
@@ -1472,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
 
     // copy the state
     {
@@ -1516,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         const char ** trigger_words,
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
-        size_t num_trigger_tokens) {
+        size_t num_trigger_tokens,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns) {
     auto * ctx = new llama_sampler_grammar;
 
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
         *ctx = {
             /* .vocab        = */ vocab,
             /* .grammar_str  = */ grammar_str,
             /* .grammar_root = */ grammar_root,
-            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
```
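The compatibility shim above turns legacy trigger words into a single lazy full-match regex, escaping regex metacharacters along the way. A standalone sketch of that conversion; it uses the `$&` replacement (the whole match), which is the standard spelling, where the shim writes `$0`, which many implementations also accept:

```cpp
#include <cstddef>
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same metacharacter class as the shim above.
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    const char * words[] = { "<tool_call>", "{\"name\":" };

    std::string pattern("[\\s\\S]*?(");
    for (std::size_t i = 0; i < sizeof(words) / sizeof(words[0]); ++i) {
        if (i > 0) {
            pattern += "|";
        }
        // "$&" = the whole match: prefix each metacharacter with a backslash
        pattern += std::regex_replace(words[i], special_chars, "\\$&");
    }
    pattern += ")[\\s\\S]*";

    std::cout << pattern << "\n";  // [\s\S]*?(<tool_call>|\{"name":)[\s\S]*
}
```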
```diff
@@ -1545,7 +1567,7 @@ struct llama_sampler * llama_sampler_init_grammar(
         const struct llama_vocab * vocab,
         const char * grammar_str,
         const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }
 
 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1556,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
         size_t num_trigger_words,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
 }
 
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        const char ** trigger_patterns,
+        size_t num_trigger_patterns,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
+}
+
 // penalties
```