tool-call: fix Qwen 2.5 Coder support, add micro benchmarks, support trigger patterns for lazy grammars (#12034)

* sampler: turn lazy grammar trigger words to regexes

* add scripts/tool_bench.sh & .py

* constrain llama json output regardless of function name if matches at beginning

* update relaxed newline space rule in grammar tests

* support add_generation_prompt query parameter (useful for /apply_template)

* Update src/llama-grammar.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Olivier Chafik 2025-03-05 13:05:13 +00:00 committed by GitHub
parent fa31c438e0
commit 669912d9a5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 1314 additions and 408 deletions

View file

@ -67,6 +67,9 @@ class ServerProcess:
id_slot: int | None = None
cache_prompt: bool | None = None
n_slots: int | None = None
ctk: str | None = None
ctv: str | None = None
fa: bool | None = None
server_continuous_batching: bool | None = False
server_embeddings: bool | None = False
server_reranking: bool | None = False
@ -84,6 +87,7 @@ class ServerProcess:
reasoning_format: Literal['deepseek', 'none'] | None = None
chat_template: str | None = None
chat_template_file: str | None = None
server_path: str | None = None
# session variables
process: subprocess.Popen | None = None
@ -97,7 +101,9 @@ class ServerProcess:
self.server_port = int(os.environ["PORT"])
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
if "LLAMA_SERVER_BIN_PATH" in os.environ:
if self.server_path is not None:
server_path = self.server_path
elif "LLAMA_SERVER_BIN_PATH" in os.environ:
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
elif os.name == "nt":
server_path = "../../../build/bin/Release/llama-server.exe"
@ -151,6 +157,12 @@ class ServerProcess:
server_args.extend(["--ctx-size", self.n_ctx])
if self.n_slots:
server_args.extend(["--parallel", self.n_slots])
if self.ctk:
server_args.extend(["-ctk", self.ctk])
if self.ctv:
server_args.extend(["-ctv", self.ctv])
if self.fa is not None:
server_args.append("-fa")
if self.n_predict:
server_args.extend(["--n-predict", self.n_predict])
if self.slot_save_path:
@ -184,7 +196,7 @@ class ServerProcess:
server_args.extend(["--chat-template-file", self.chat_template_file])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
print(f"tests: starting server with: {' '.join(args)}")
flags = 0
if "nt" == os.name:
@ -215,6 +227,10 @@ class ServerProcess:
return # server is ready
except Exception as e:
pass
# Check if process died
if self.process.poll() is not None:
raise RuntimeError(f"Server process died with return code {self.process.returncode}")
print(f"Waiting for server to start...")
time.sleep(0.5)
raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")