Final touches

This commit is contained in:
Georgi Gerganov 2023-03-10 21:50:46 +02:00
parent 775328064e
commit 319cdb3e1f
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
5 changed files with 32 additions and 32 deletions

View file

@ -231,39 +231,39 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
}
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
auto res = gpt_tokenize(vocab, text);
if (bos) {
res.insert(res.begin(), 1); // TODO: replace with vocab.bos
}
//std::vector<gpt_vocab::id> res;
//auto res = gpt_tokenize(vocab, text);
//if (bos) {
// res.push_back(1); // TODO: replace with vocab.bos
// res.insert(res.begin(), 1); // TODO: replace with vocab.bos
//}
// find the longest token that matches the text
//int pos = 0;
//while (true) {
// int l = 0;
// int t = 0;
// for (const auto & kv : vocab.id_to_token) {
// if (kv.second.size() < l) continue;
// if (kv.second.size() > text.size() - pos) continue;
// if (text.substr(pos, kv.second.size()) == kv.second) {
// l = kv.second.size();
// t = kv.first;
// }
// }
std::vector<gpt_vocab::id> res;
// if (l == 0 && t != 13) {
// break;
// }
if (bos) {
res.push_back(1); // TODO: replace with vocab.bos
}
// res.push_back(t);
// pos += l;
//}
//find the longest token that matches the text
int pos = 0;
while (true) {
int l = 0;
int t = 0;
for (const auto & kv : vocab.id_to_token) {
if (kv.second.size() < l) continue;
if (kv.second.size() > text.size() - pos) continue;
if (text.substr(pos, kv.second.size()) == kv.second) {
l = kv.second.size();
t = kv.first;
}
}
if (l == 0 && t != 13) {
break;
}
res.push_back(t);
pos += l;
}
return res;
}