llama-vocab : add SuperBPE pre-tokenizer (#12532)

This commit is contained in:
compilade 2025-03-24 06:47:24 -04:00 committed by GitHub
parent 7ea75035b6
commit 00d53800e0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 15 additions and 0 deletions

View file

@ -400,6 +400,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
regex_exprs = {
"\\p{N}+",
"(?=(\\d{3})+(?!\\d))",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@ -1604,6 +1610,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "gpt-4o") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
clean_spaces = false;
} else if (
tokenizer_pre == "superbpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
clean_spaces = false;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}