llama : fix pre-tokenization of non-special added tokens (#8228)

* llama : fix mpt and olmo pre-tokenizer * llama : pre-tokenize non-special user-defined tokens first * llama : fix detection of control-like user-defined tokens * convert_hf : identify which user-defined tokens are control tokens Only used in _set_vocab_gpt2() for now. * convert_hf : identify more added control tokens for SPM tokenizers This makes Gemma and Gemma-2 tokenize pretty much EVERYTHING correctly, including HTML tags and consecutive spaces, but it unfortunately requires model re-conversion. There seems to be a weird behavior of the HF tokenizer for Gemma, which prefers to use the 16-space token over more lengthy space tokens, while using the SentencePiece tokenizer does not do this. (the implementation in llama.cpp has the same behavior as SentencePiece) * llama : fix wrong pre-tokenization of byte tokens * llama : fix Viking pre-tokenizer regex The order was previously wrong, which caused errors in some tests. * llama : fix command-r detokenization * convert_hf : reduce usages of the UNKNOWN token type * llama : add UNKNOWN tokens in the special tokens cache * convert_hf : reduce usages of UNKNOWN for InternLM2 This makes the changes from #8321 more consistent with the other changes made here. * test-tokenizer-random : reduce potential conflicts with #8379 * test-tokenizer-random : add a failing edge case for falcon
2024-07-13 23:35:10 -04:00 · 2024-07-13 23:35:10 -04:00 · fa79495bb4
commit fa79495bb4
parent 17eb6aa8a9
4 changed files with 91 additions and 61 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5419,6 +5419,7 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "command-r") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+                vocab.tokenizer_clean_spaces = false;
            } else if (
                tokenizer_pre == "qwen2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
@ -5652,7 +5653,7 @@ static void llm_load_vocab(
    // build special tokens cache
    {
        for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
+            if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
                vocab.cache_special_tokens.push_back(id);
            }
        }
@ -15411,17 +15412,6 @@ struct llm_tokenizer_bpe {
                    "[0-9][0-9][0-9]",
                };
                break;
-            case LLAMA_VOCAB_PRE_TYPE_MPT:
-                // TODO: MPT pre-tokenization regexes are unknown
-                //       the following are close, but not exact. run the following:
-                //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
-                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
-                regex_exprs = {
-                    "\\s?\\p{L}+",
-                    "\\s?\\p{P}+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                };
-                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
@ -15431,6 +15421,7 @@ struct llm_tokenizer_bpe {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
                regex_exprs = {
@ -15457,8 +15448,8 @@ struct llm_tokenizer_bpe {
                break;
            case LLAMA_VOCAB_PRE_TYPE_VIKING:
                regex_exprs = {
-                    "\\p{N}",
                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                    "\\p{N}",
                };
                break;
            default:
@ -16178,12 +16169,20 @@ struct fragment_buffer_variant {

 // #define PRETOKENIZERDEBUG

-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
    // for each special token
    for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
        const auto & data = vocab.id_to_token[special_id];
        const auto & special_token = data.text;

+        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+            // Ignore control and unknown tokens when parse_special == false
+            continue;
+            // User-defined tokens are still pre-tokenized before everything else
+            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+        }
+
        // for each text fragment
        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
        while (it != buffer.end()) {
@ -16296,7 +16295,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

    if (!raw_text.empty()) {
        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
-        if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+        tokenizer_st_partition(vocab, fragment_buffer, parse_special);
    }

    switch (vocab.type) {