Added support for Viking pre-tokenizer (#8135)
Co-authored-by: kustaaya <kustaaya@protonmail.com>
This commit is contained in:
parent
911e35bb8b
commit
f675b20a3b
4 changed files with 14 additions and 0 deletions
|
@ -5067,6 +5067,9 @@ static void llm_load_vocab(
|
|||
} else if (
|
||||
tokenizer_pre == "poro-chat") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
|
||||
} else if (
|
||||
tokenizer_pre == "viking") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
|
@ -13703,6 +13706,12 @@ struct llm_tokenizer_bpe {
|
|||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_VIKING:
|
||||
regex_exprs = {
|
||||
"\\p{N}",
|
||||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||
};
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue