llama : add pre-tokenizer regexes for BLOOM and gpt3-finnish (#8850)
This commit is contained in:
parent
d5492f0525
commit
6bda7ce6c3
5 changed files with 19 additions and 1 deletions
|
@ -94,6 +94,8 @@ models = [
|
|||
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
|
||||
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
|
||||
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
|
||||
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
|
||||
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
|
||||
]
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue