llama-vocab : add SuperBPE pre-tokenizer (#12532)

This commit is contained in:
compilade 2025-03-24 06:47:24 -04:00 committed by GitHub
parent 7ea75035b6
commit 00d53800e0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 15 additions and 0 deletions

View file

@@ -110,6 +110,7 @@ models = [
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
]