Added support for Viking pre-tokenizer (#8135)

Co-authored-by: kustaaya <kustaaya@protonmail.com>
2024-06-27 11:58:54 +03:00 · 2024-06-27 11:58:54 +03:00 · f675b20a3b
commit f675b20a3b
parent 911e35bb8b
4 changed files with 14 additions and 0 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5067,6 +5067,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "poro-chat") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+            } else if (
+                tokenizer_pre == "viking") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -13703,6 +13706,12 @@ struct llm_tokenizer_bpe {
                    " ?[^(\\s|.,!?…。，、।۔،)]+",
                };
                break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                };
+                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {