Unicode codepoint flags for custom regexes (#7245)

* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and bugfix brute force random test
* Deterministic brute force random test
* Unicode normalization NFD
* Get rid of BOM
2024-05-18 01:09:13 +02:00 · 2024-05-18 01:09:13 +02:00 · b43272afa2
commit b43272afa2
parent 0fc1e820a9
7 changed files with 7299 additions and 2409 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -12576,16 +12576,16 @@ struct llm_tokenizer_wpm {
        // to lowercase, pad chinese characters, pad punctuation
        std::string new_str = "";
        for (uint32_t code : cpts_nfd) {
-            int type = unicode_cpt_type(code);
-            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+            const codepoint_flags flags = unicode_cpt_flags(code);
+            if (flags.is_accent_mark || flags.is_control) {
                continue;
            }
            code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_SEPARATOR) {
+            if (flags.is_separator || flags.is_whitespace) {  //####FIXME: is_separator ?
                code = ' ';
            }
            std::string s = unicode_cpt_to_utf8(code);
-            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
                new_str += " ";
                new_str += s;
                new_str += " ";