From 9f47fa5792bae5312615439e68dbf1826913f7ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 5 Jun 2025 09:29:18 +0200 Subject: [PATCH] vocab : warn about missing mask token (#14022) --- src/llama-vocab.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index b5197669..ba2e1864 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2098,7 +2098,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"}) || _contains_any(general_arch, {"nomic-bert-moe"}) ) { - _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true); + if (token_to_id.count("<mask>") == 0) { + LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__); + } else { + _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true); + } } else if (_contains_any(model_name, {"phi-3", "phi3"})) { for (auto id : cache_special_tokens) { _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);