llama-vocab : add SuperBPE pre-tokenizer (#12532)

2025-03-24 06:47:24 -04:00 · 2025-03-24 06:47:24 -04:00 · 00d53800e0
commit 00d53800e0
parent 7ea75035b6
4 changed files with 15 additions and 0 deletions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -400,6 +400,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@ -1604,6 +1610,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "gpt-4o") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "superbpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+                clean_spaces = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }