Added support for Viking pre-tokenizer (#8135)

Co-authored-by: kustaaya <kustaaya@protonmail.com>
This commit is contained in:
kustaaya 2024-06-27 11:58:54 +03:00 committed by GitHub
parent 911e35bb8b
commit f675b20a3b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 14 additions and 0 deletions

View file

@ -5067,6 +5067,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else if (
tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@ -13703,6 +13706,12 @@ struct llm_tokenizer_bpe {
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_VIKING:
regex_exprs = {
"\\p{N}",
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {