Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)
* Update brute force test: add_special * Update brute force test: default values for add_bos_token and add_eos_token * Enable rtrim when pre-inserting BOS Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Revert "server : fix test regexes"
This commit is contained in:
parent
917dc8cfa6
commit
d7e852c1bc
5 changed files with 28 additions and 23 deletions
11
llama.cpp
11
llama.cpp
|
@ -12498,15 +12498,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
||||
// tokenizer.encode('', add_special_tokens=False) returns []
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
}
|
||||
|
||||
static const bool rtrim = true; //TODO: as param
|
||||
bool is_prev_special = false;
|
||||
bool special_token_rtrim = false;
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
is_prev_special = true;
|
||||
}
|
||||
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue