common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
This commit is contained in:
Georgi Gerganov 2024-09-15 20:46:12 +03:00 committed by GitHub
parent e6deac31f7
commit 6262d13e0b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
54 changed files with 2092 additions and 2419 deletions

View file

@ -1,5 +1,6 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
@ -8,9 +9,9 @@
#include <vector>
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
@ -20,6 +21,8 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = params.n_pp;
@ -76,7 +79,7 @@ int main(int argc, char ** argv) {
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
@ -93,17 +96,17 @@ int main(int argc, char ** argv) {
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
if (!params.batched_bench_output_jsonl) {
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
LOG("\n");
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG("\n");
LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
}
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
@ -133,7 +136,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -155,7 +158,7 @@ int main(int argc, char ** argv) {
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
@ -173,20 +176,20 @@ int main(int argc, char ** argv) {
const float speed = n_kv / t;
if(params.batched_bench_output_jsonl) {
LOG_TEE(
LOG(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
);
} else {
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
}
}
}
LOG_TEE("\n");
LOG("\n");
llama_perf_context_print(ctx);
llama_batch_free(batch);
@ -196,7 +199,7 @@ int main(int argc, char ** argv) {
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}

View file

@ -1,5 +1,6 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
@ -8,9 +9,9 @@
#include <vector>
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
@ -23,6 +24,7 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
// number of parallel batches
int n_parallel = params.n_parallel;
@ -42,7 +44,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1;
}
@ -72,31 +74,29 @@ int main(int argc, char ** argv) {
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
fprintf(stderr, "\n");
LOG("\n");
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
// create a llama_batch
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
@ -131,7 +131,7 @@ int main(int argc, char ** argv) {
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -142,7 +142,7 @@ int main(int argc, char ** argv) {
//}
if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
}
// main loop
@ -175,9 +175,9 @@ int main(int argc, char ** argv) {
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
LOG("\n");
if (n_parallel > 1) {
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
}
continue;
@ -185,8 +185,7 @@ int main(int argc, char ** argv) {
// if there is only one stream, we print immediately to stdout
if (n_parallel == 1) {
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
}
streams[i] += llama_token_to_piece(ctx, new_token_id);
@ -208,27 +207,25 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
LOG_TEE("\n");
if (n_parallel > 1) {
LOG_TEE("\n");
LOG("\n");
for (int32_t i = 0; i < n_parallel; ++i) {
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
}
}
const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n");
LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);

View file

@ -9,6 +9,7 @@
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
}
static void print_sample_weights(TransformerWeights *w){
LOG("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]);
LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
LOG_INF("%f\n", w->token_embedding_table[0]);
LOG_INF("%f\n", w->rms_att_weight[0]);
LOG_INF("%f\n", w->rms_ffn_weight[0]);
LOG("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
LOG_INF("%f\n", w->wq[0]);
LOG_INF("%f\n", w->wk[0]);
LOG_INF("%f\n", w->wv[0]);
LOG_INF("%f\n", w->wo[0]);
LOG_INF("%f\n", w->w1[0]);
LOG_INF("%f\n", w->w2[0]);
LOG_INF("%f\n", w->w3[0]);
LOG_INF("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -318,20 +319,20 @@ struct train_params {
};
static void print_params(struct my_llama_hparams * params) {
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__);
LOG_INF("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
}
int main(int argc, char ** argv) {
gpt_init();
struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
auto shared_weights = config.vocab_size > 0;
@ -896,7 +899,7 @@ int main(int argc, char ** argv) {
// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;

View file

@ -13,14 +13,15 @@
#include "ggml-metal.h"
#endif
#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
//////////////////////////////////////////////////

View file

@ -1,5 +1,6 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <ctime>
@ -39,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model
if (llama_encode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to encode\n", __func__);
LOG_ERR("%s : failed to encode\n", __func__);
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
LOG_ERR("%s : failed to decode\n", __func__);
}
}
@ -84,12 +85,12 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
print_build_info();
llama_backend_init();
llama_numa_init(params.numa);
@ -99,7 +100,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
@ -109,19 +110,19 @@ int main(int argc, char ** argv) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
// split the prompt into lines
@ -136,7 +137,7 @@ int main(int argc, char ** argv) {
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false);
if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
@ -147,20 +148,20 @@ int main(int argc, char ** argv) {
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) inputs.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
}
fprintf(stderr, "\n\n");
LOG("\n\n");
}
}
@ -211,57 +212,57 @@ int main(int argc, char ** argv) {
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
fprintf(stdout, "\n");
LOG("\n");
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
fprintf(stdout, "embedding %d: ", j);
LOG("embedding %d: ", j);
for (int i = 0; i < std::min(3, n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, " ... ");
LOG(" ... ");
for (int i = n_embd - 3; i < n_embd; i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, "\n");
LOG("\n");
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
LOG("embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, "\n");
LOG("\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
LOG("\n");
LOG("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
fprintf(stdout, "%6.6s ", prompts[i].c_str());
LOG("%6.6s ", prompts[i].c_str());
}
fprintf(stdout, "\n");
LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
LOG("%6.2f ", sim);
}
fprintf(stdout, "%1.10s", prompts[i].c_str());
fprintf(stdout, "\n");
LOG("%1.10s", prompts[i].c_str());
LOG("\n");
}
}
}
@ -270,42 +271,42 @@ int main(int argc, char ** argv) {
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array";
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt)
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
fprintf(stdout, "[");
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
if (i < n_embd) fprintf(stdout, ","); else break;
if (i < n_embd) LOG(","); else break;
}
fprintf(stdout, notArray ? "]\n }" : "]");
LOG(notArray ? "]\n }" : "]");
j++;
if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
}
fprintf(stdout, notArray ? "\n ]" : "]\n");
LOG(notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
LOG(",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
fprintf(stdout, " [");
LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim);
LOG("%6.2f", sim);
j++;
if (j < n_embd_count) fprintf(stdout, ", "); else break;
if (j < n_embd_count) LOG(", "); else break;
}
fprintf(stdout, " ]");
LOG(" ]");
i++;
if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
if (i < n_embd_count) LOG(",\n"); else break;
}
fprintf(stdout, "\n ]");
LOG("\n ]");
}
if (notArray) fprintf(stdout, "\n}\n");
if (notArray) LOG("\n}\n");
}
LOG_TEE("\n");
LOG("\n");
llama_perf_context_print(ctx);
// clean up

View file

@ -1,12 +1,11 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>
/**
@ -32,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n");
LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
LOG(" ..., \n");
i2 = ne[2] - n;
}
printf(" [\n");
LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n");
LOG(" ..., \n");
i1 = ne[1] - n;
}
printf(" [");
LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
printf("..., ");
LOG("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@ -65,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else {
GGML_ABORT("fatal error");
}
printf("%12.4f", v);
LOG("%12.4f", v);
sum += v;
if (i0 < ne[0] - 1) printf(", ");
if (i0 < ne[0] - 1) LOG(", ");
}
printf("],\n");
LOG("],\n");
}
printf(" ],\n");
LOG(" ],\n");
}
printf(" ]\n");
printf(" sum = %f\n", sum);
LOG(" ]\n");
LOG(" sum = %f\n", sum);
}
}
@ -103,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
@ -133,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
@ -149,7 +148,7 @@ int main(int argc, char ** argv) {
return 1;
}
print_build_info();
gpt_init();
llama_backend_init();
llama_numa_init(params.numa);
@ -166,14 +165,15 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
}
bool OK = run(ctx, params);
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
return 1;
}
LOG_TEE("\n");
LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);

View file

@ -406,7 +406,7 @@ int main(int argc, char ** argv) {
return 1;
}
g_verbose = (params.verbosity == 1);
g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();

View file

@ -158,6 +158,8 @@ int main(int argc, char * argv[]) {
return 1;
}
gpt_init();
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params);

View file

@ -1,5 +1,6 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
@ -19,12 +20,12 @@
#endif
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
LOG("\nexample usage:\n");
LOG("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n");
LOG("\n");
}
struct Stats {
@ -125,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
}
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
@ -151,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
LOG("\n");
LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
@ -174,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ABORT("fatal error");
}
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
@ -239,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
}
if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n");
LOG_INF("\n");
is_first = false;
}
if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
@ -258,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
}
if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
std::ofstream out(fname, std::ios::binary);
@ -290,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
LOGV(1, "\n");
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, fname);
LOG_ERR("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, fname);
LOG_ERR("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
@ -312,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
@ -323,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
m_stats = {};
return false;
}
@ -336,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
m_stats = {};
return false;
}
@ -437,26 +434,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
return false;
}
@ -478,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -514,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
// TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
@ -531,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
LOG("%.2f minutes\n", total_seconds / 60.0);
}
if (params.compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();
}
}
printf("\n");
LOG("\n");
if (params.compute_ppl) {
nll2 /= count;
@ -562,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
LOG("Unexpected negative standard deviation of log(prob)\n");
}
}
@ -576,26 +572,27 @@ int main(int argc, char ** argv) {
params.n_ctx = 512;
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1;
}
gpt_init();
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
}
if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
}
@ -614,20 +611,20 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
if (!compute_imatrix(ctx, params)) {
@ -636,7 +633,7 @@ int main(int argc, char ** argv) {
g_collector.save_imatrix();
LOG_TEE("\n");
LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);

View file

@ -2,6 +2,7 @@
#include "common.h"
#include "console.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cassert>
@ -55,7 +56,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
@ -64,7 +65,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
@ -93,7 +94,7 @@ static void sigint_handler(int signo) {
is_interacting = true;
} else {
console::cleanup();
printf("\n");
LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
@ -110,56 +111,51 @@ int main(int argc, char ** argv) {
return 1;
}
auto & sparams = params.sparams;
gpt_init();
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("infill", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
auto & sparams = params.sparams;
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
LOG_ERR("\n************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
LOG_ERR("\n************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
printf("************\n\n");
LOG_ERR("\n************\n");
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
print_build_info();
LOG("%s: llama backend init\n", __func__);
LOG_INF("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
@ -172,34 +168,32 @@ int main(int argc, char ** argv) {
g_smpl = &smpl;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
LOG_DBG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
@ -224,18 +218,19 @@ int main(int argc, char ** argv) {
embd_inp.push_back(middle_token);
}
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_DBG("add_bos: %d\n", add_bos);
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
@ -244,9 +239,8 @@ int main(int argc, char ** argv) {
params.n_keep = (int)embd_inp.size();
}
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
@ -254,21 +248,21 @@ int main(int argc, char ** argv) {
}
if (params.verbose_prompt) {
LOG_TEE("\n");
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
LOG_INF("\n");
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
LOG("'\n");
}
LOG_TEE("\n");
LOG_INF("\n");
}
if (params.interactive) {
@ -285,28 +279,30 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
LOG_TEE("%s: interactive mode on.\n", __func__);
LOG_INF("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n");
LOG_INF("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
smpl = gpt_sampler_init(model, sparams);
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_TEE("\n##### Infill mode #####\n\n");
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG("\n");
LOG("\n##### Infill mode #####\n\n");
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
@ -317,11 +313,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_TEE("== Running in interactive mode. ==\n");
LOG("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
LOG( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_TEE( "%s\n", control_message);
LOG( "%s\n", control_message);
is_interacting = params.interactive_first;
}
@ -354,9 +350,8 @@ int main(int argc, char ** argv) {
embd.resize(max_embd_size);
console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
fflush(stdout);
}
// infinite text generation via context swapping
@ -365,14 +360,14 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
@ -380,9 +375,9 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
LOG("after swap: n_past = %d\n", n_past);
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
}
@ -394,16 +389,16 @@ int main(int argc, char ** argv) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG("n_past = %d\n", n_past);
LOG_DBG("n_past = %d\n", n_past);
}
}
@ -415,7 +410,7 @@ int main(int argc, char ** argv) {
gpt_sampler_accept(smpl, id, true);
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
@ -425,10 +420,10 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget
--n_remain;
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
@ -447,7 +442,7 @@ int main(int argc, char ** argv) {
if (input_echo) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
@ -456,7 +451,6 @@ int main(int argc, char ** argv) {
output_ss << token_str;
}
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
@ -469,10 +463,9 @@ int main(int argc, char ** argv) {
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
fflush(stdout);
printf("\n");
LOG("\n");
console::set_display(console::user_input);
std::string buffer;
std::string line;
@ -528,35 +521,33 @@ int main(int argc, char ** argv) {
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
// LOG_TEE("took new input\n");
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG("found EOS token\n");
LOG_DBG("found EOS token\n");
if (params.interactive) {
is_interacting = true;
printf("\n");
LOG("\n");
console::set_display(console::user_input);
fflush(stdout);
}
}
if (n_past > 0 && is_interacting && !params.interactive) {
LOG("waiting for user input\n");
LOG_DBG("waiting for user input\n");
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
printf("%s", buffer.c_str());
LOG("%s", buffer.c_str());
}
std::string line;
@ -574,17 +565,17 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
LOG("%s", params.input_suffix.c_str());
}
LOG("buffer: '%s'\n", buffer.c_str());
LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@ -595,9 +586,9 @@ int main(int argc, char ** argv) {
}
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
@ -624,11 +615,10 @@ int main(int argc, char ** argv) {
}
}
if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
fflush(stdout);
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
LOG_TEE("\n");
LOG("\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
@ -638,9 +628,5 @@ int main(int argc, char ** argv) {
gpt_sampler_free(smpl);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
return 0;
}

View file

@ -3,7 +3,6 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
@ -40,6 +39,11 @@
#include <cinttypes>
#include <limits>
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@ -165,7 +169,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
static int get_key_idx(const gguf_context * ctx, const char * key) {
int i = gguf_find_key(ctx, key);
if (i == -1) {
LOG_TEE("key %s not found in file\n", key);
LOG_ERR("key %s not found in file\n", key);
throw std::runtime_error(format("Missing required key: %s", key));
}
@ -270,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
@ -288,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
@ -307,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
@ -568,7 +572,7 @@ struct clip_ctx {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr;
}
@ -582,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (load_image_size == nullptr) {
load_image_size = clip_image_size_init();
}
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
image_size_width = load_image_size->width;
image_size_height = load_image_size->height;
if (is_inf) {
@ -1047,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
}
LOG_TEE("%s: description: %s\n", __func__, description.c_str());
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
LOG_TEE("\n");
LOG_INF("%s: description: %s\n", __func__, description.c_str());
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
LOG_INF("\n");
}
const int n_tensors = gguf_get_n_tensors(ctx);
// kv
const int n_kv = gguf_get_n_kv(ctx);
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
@ -1072,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
n_type[type]++;
}
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
@ -1088,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
replace_all(value, "\n", "\\n");
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
@ -1097,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
continue;
}
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
}
}
@ -1112,7 +1116,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
size_t tensor_size = ggml_nbytes(cur);
model_size += tensor_size;
if (verbosity >= 3) {
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
}
}
@ -1139,27 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0);
LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif
#ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init();
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
LOG_INF("%s: CLIP using CANN backend\n", __func__);
#endif
#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0);
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
LOG_TEE("%s: CLIP using CPU backend\n", __func__);
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
// model size and capabilities
@ -1194,16 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
if (verbosity >= 1) {
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
}
}
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
// load tensors
{
@ -1216,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) {
LOG_TEE("%s: ggml_init() failed\n", __func__);
LOG_ERR("%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
@ -1224,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
LOG_TEE("cannot open model file for loading tensors\n");
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
@ -1246,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
if (!fin) {
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
@ -1317,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
if (verbosity >= 2) {
LOG_TEE("\n%s: vision model hparams\n", __func__);
LOG_TEE("image_size %d\n", hparams.image_size);
LOG_TEE("patch_size %d\n", hparams.patch_size);
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
LOG_TEE("v_n_head %d\n", hparams.n_head);
LOG_TEE("v_n_layer %d\n", hparams.n_layer);
LOG_TEE("v_eps %f\n", hparams.eps);
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
LOG_TEE("v_image_grid_pinpoints: ");
LOG_INF("\n%s: vision model hparams\n", __func__);
LOG_INF("image_size %d\n", hparams.image_size);
LOG_INF("patch_size %d\n", hparams.patch_size);
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
LOG_INF("v_n_head %d\n", hparams.n_head);
LOG_INF("v_n_layer %d\n", hparams.n_layer);
LOG_INF("v_eps %f\n", hparams.eps);
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
LOG_INF("v_image_grid_pinpoints: ");
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
}
LOG_TEE("\n");
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
LOG_INF("\n");
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
}
@ -1371,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
}
// LLaVA projection
@ -1400,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} catch (std::runtime_error & /*e*/) { }
try {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & /*e*/) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
@ -1501,7 +1505,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
}
return new_clip;
@ -1552,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
@ -1564,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
LOG_TEE("%s: failed to decode image bytes\n", __func__);
LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
@ -1754,7 +1758,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
// LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
@ -1872,7 +1876,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images;
LOG_TEE("%s: multiple %d\n", __func__, multiple);
LOG_INF("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>());
if (multiple <= 1) {
@ -1887,17 +1891,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
clip_image_u8 * source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 * refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image->nx;
@ -1954,7 +1958,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
int idx = 0;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
res_imgs->data[idx++] = *res;
@ -1966,7 +1970,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
auto & params = ctx->vision_model.hparams;
@ -2043,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
}
for (size_t i = 0; i < patches.size(); i++) {
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
// LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
clip_image_u8_free(patches[i]);
}
@ -2279,7 +2283,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
@ -2291,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
@ -2521,7 +2525,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
}
const size_t n_elms = ggml_nelements(cur);
float * f32_data;
@ -2540,7 +2544,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
f32_data = (float *)conv_buf.data();
break;
default:
LOG_TEE("Please use an input file in f32 or f16\n");
LOG_ERR("Please use an input file in f32 or f16\n");
gguf_free(ctx_out);
return false;
}
@ -2567,7 +2571,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
fout.put(0);
}
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
}
@ -2583,8 +2587,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
gguf_free(ctx_out);
{
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
}
return true;

View file

@ -10,6 +10,7 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@ -20,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
@ -75,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
@ -89,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
LOG_TEE("%s: could not load image from base64 string.\n", __func__);
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
@ -114,9 +115,9 @@ struct llava_context {
};
static void print_usage(int, char ** argv) {
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
LOG("\n example usage:\n");
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@ -126,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
@ -156,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
@ -177,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
@ -188,11 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// generate the response
LOG_TEE("\n");
LOG("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
if (!smpl) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
@ -202,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@ -211,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
}
gpt_sampler_free(smpl);
printf("\n");
LOG("\n");
}
static struct llama_model * llava_init(gpt_params * params) {
@ -222,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__);
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
@ -245,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
@ -268,12 +269,6 @@ static void llava_free(struct llava_context * ctx_llava) {
llama_backend_free();
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) {
ggml_time_init();
@ -283,27 +278,23 @@ int main(int argc, char ** argv) {
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
gpt_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
auto model = llava_init(&params);
auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
auto ctx_llava = llava_init_context(&params, model);
auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, "");
auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
@ -314,11 +305,11 @@ int main(int argc, char ** argv) {
llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model);
auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}

View file

@ -1,13 +1,23 @@
#include "clip.h"
#include "common.h"
#include "llama.h"
#include "llava.h"
#include "base64.hpp"
#include "llama.h"
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
#include <numeric>
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// RGB uint8 image
struct clip_image_u8 {
@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
LOG_TEE("%s: unable to preprocess image\n", __func__);
LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
LOG_TEE("Unable to encode image\n");
LOG_ERR("Unable to encode image\n");
return false;
}
@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n");
LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
LOG_ERR("%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
LOG_TEE("%s: coulnd't embed the image\n", __func__);
LOG_ERR("%s: coulnd't embed the image\n", __func__);
return NULL;
}
@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
LOG_TEE("%s: can't read file %s\n", __func__, path);
LOG_ERR("%s: can't read file %s\n", __func__, path);
return false;
}
@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
LOG_ERR("%s: failed to load %s\n", __func__, image_path);
return NULL;
}

View file

@ -7,9 +7,12 @@
#include "llama.h"
#include "ggml.h"
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <iostream> // TODO: remove me
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
@ -18,14 +21,8 @@ struct llava_context {
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llama_model * llava_init(gpt_params * params) {
@ -36,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__);
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
@ -51,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
@ -60,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
@ -89,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
@ -101,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
@ -125,7 +122,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
@ -143,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
@ -162,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
LOG_INF("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct gpt_sampler * smpl,
@ -181,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl,
}
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
auto * ctx_clip = clip_init_context(params);
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL;
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
LOG_ERR("prompt should be given or interactive mode should be on");
return NULL;
}
auto model = llava_init(params);
auto * model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
}
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
@ -238,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
// generate the response
LOG_TEE("\n");
LOG_INF("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
return smpl;
@ -259,12 +256,7 @@ int main(int argc, char ** argv) {
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
gpt_init();
if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
@ -273,21 +265,23 @@ int main(int argc, char ** argv) {
for (auto & image : params.image) {
int n_past = 0;
auto ctx_llava = minicpmv_init(&params, image, n_past);
auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) {
LOG_TEE("<user>%s\n", params.prompt.c_str());
LOG_TEE("<assistant>");
auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
LOG("<user>%s\n", params.prompt.c_str());
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = "";
std::string response;
bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, smpl, n_past);
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0){
if(!have_tmp)continue;
else break;
if (!have_tmp) {
continue;
}
break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true;
@ -299,15 +293,15 @@ int main(int argc, char ** argv) {
gpt_sampler_free(smpl);
}else {
while (true) {
LOG_TEE("<user>");
LOG("<user>");
std::string prompt;
std::getline(std::cin, prompt);
LOG_TEE("<assistant>");
auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = "";
std::string response;
for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, smpl, n_past);
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior

View file

@ -1,6 +1,7 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cstdio>
@ -42,18 +43,14 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookahead", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
LOG("\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
@ -166,7 +163,7 @@ int main(int argc, char ** argv) {
{
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
fflush(stdout);
}
}
@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
}
if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
return 1;
}
@ -293,10 +290,10 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx, id);
if (v == 0) {
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
} else {
// print light cyan
printf("\033[0;96m%s\033[0m", token_str.c_str());
LOG("\033[0;96m%s\033[0m", token_str.c_str());
}
fflush(stdout);
@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
// print known n-grams starting with token id (debug)
if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) {
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
}
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
printf(" - ngram %2d: ", i);
LOG(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
}
printf("\n");
LOG("\n");
}
}
@ -455,20 +452,20 @@ int main(int argc, char ** argv) {
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n");
LOG_TEE("W = %2d\n", W);
LOG_TEE("N = %2d\n", N);
LOG_TEE("G = %2d\n", G);
LOG_TEE("\n");
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_INF("\n");
LOG_INF("W = %2d\n", W);
LOG_INF("N = %2d\n", N);
LOG_INF("G = %2d\n", G);
LOG_INF("\n");
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_accept = %d\n", n_accept);
LOG_TEE("\n");
LOG_INF("\n");
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
@ -482,7 +479,7 @@ int main(int argc, char ** argv) {
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}

View file

@ -5,13 +5,12 @@
#include "llama.h"
#include "ggml.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
@ -20,6 +19,8 @@ int main(int argc, char ** argv){
return 1;
}
gpt_init();
const int n_draft = params.n_draft;
// init llama.cpp
@ -49,7 +50,7 @@ int main(int argc, char ** argv){
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
@ -128,7 +129,7 @@ int main(int argc, char ** argv){
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
@ -136,24 +137,24 @@ int main(int argc, char ** argv){
ngram_cache_context.clear();
}
LOG_TEE("\n");
LOG("\n");
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}

View file

@ -3,6 +3,7 @@
#include "common.h"
#include "ngram-cache.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cstdint>
@ -18,17 +19,13 @@ int main(int argc, char ** argv){
return 1;
}
gpt_init();
// max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookup", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@ -58,7 +55,7 @@ int main(int argc, char ** argv){
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
@ -76,14 +73,14 @@ int main(int argc, char ** argv){
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
LOG("\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
@ -124,7 +121,7 @@ int main(int argc, char ** argv){
}
// print current draft sequence
LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
int i_dft = 0;
while (true) {
@ -136,7 +133,7 @@ int main(int argc, char ** argv){
const std::string token_str = llama_token_to_piece(ctx, id);
if (!params.use_color) {
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
}
if (llama_token_is_eog(model, id)) {
@ -147,7 +144,7 @@ int main(int argc, char ** argv){
// check if the target token matches the draft
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past;
++i_dft;
@ -161,19 +158,19 @@ int main(int argc, char ** argv){
if (params.use_color) {
// color accepted draft token
printf("\033[34m%s\033[0m", token_str.c_str());
LOG("\033[34m%s\033[0m", token_str.c_str());
fflush(stdout);
}
continue;
}
if (params.use_color) {
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
}
fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
draft.clear();
draft.push_back(id);
@ -224,22 +221,22 @@ int main(int argc, char ** argv){
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG_TEE("\n\n");
LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ntarget:\n\n");
LOG_INF("\ntarget:\n\n");
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
@ -251,7 +248,7 @@ int main(int argc, char ** argv){
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}

View file

@ -1,12 +1,11 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "log.h"
#include "sampling.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
@ -42,11 +41,13 @@ static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static bool need_insert_eot = false;
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
printf("\n");
static void print_usage(int argc, char ** argv) {
(void) argc;
LOG("\nexample usage:\n");
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
LOG("\n");
}
static bool file_exists(const std::string & path) {
@ -74,8 +75,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
return;
}
@ -83,7 +83,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
@ -113,7 +113,7 @@ static void sigint_handler(int signo) {
need_insert_eot = true;
} else {
console::cleanup();
printf("\n");
LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
@ -122,17 +122,11 @@ static void sigint_handler(int signo) {
}
#endif
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str());
LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
}
@ -143,55 +137,46 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
auto & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
print_build_info();
LOG_INF("%s: llama backend init\n", __func__);
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
@ -206,21 +191,19 @@ int main(int argc, char ** argv) {
g_smpl = &smpl;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
LOG_ERR("%s: error: unable to load model\n", __func__);
return 1;
}
LOG("%s: llama threadpool init = n_threads = %d\n",
__func__,
(int) params.cpuparams.n_threads
);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
@ -232,8 +215,8 @@ int main(int argc, char ** argv) {
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
exit(1);
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
return 1;
}
// Start the non-batch threadpool in the paused state
@ -242,55 +225,54 @@ int main(int argc, char ** argv) {
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
return 1;
}
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print chat template example in conversation mode
if (params.conversation) {
if (params.enable_chat_template) {
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
} else {
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
}
// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
}
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
if (!path_session.empty()) {
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
if (!file_exists(path_session)) {
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
LOG_INF("%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(path_session)) {
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
session_tokens.resize(n_ctx);
size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
}
}
@ -298,7 +280,8 @@ int main(int argc, char ** argv) {
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_add_eos_token(model));
}
LOG("add_bos: %d\n", add_bos);
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
std::vector<llama_token> embd_inp;
@ -307,31 +290,31 @@ int main(int argc, char ** argv) {
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
LOG_DBG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
} else {
LOG("use session tokens\n");
LOG_DBG("use session tokens\n");
embd_inp = session_tokens;
}
LOG("prompt: \"%s\"\n", log_tostr(prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_TEE("error: input is empty\n");
LOG_ERR("input is empty\n");
return -1;
}
}
// Tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
@ -345,29 +328,28 @@ int main(int argc, char ** argv) {
n_matching_session_tokens++;
}
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
LOG_TEE("%s: using full prompt from session file\n", __func__);
LOG_INF("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
LOG_INF("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size());
LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size());
} else {
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size());
LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size());
}
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
}
LOGLN(
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1);
}
@ -389,21 +371,20 @@ int main(int argc, char ** argv) {
}
if (params.verbose_prompt) {
LOG_TEE("\n");
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > add_bos) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
LOG("'\n");
}
LOG_TEE("\n");
LOG_INF("\n");
}
// ctrl+C handling
@ -423,40 +404,40 @@ int main(int argc, char ** argv) {
}
if (params.interactive) {
LOG_TEE("%s: interactive mode on.\n", __func__);
LOG("%s: interactive mode on.\n", __func__);
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n");
LOG("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
@ -464,15 +445,15 @@ int main(int argc, char ** argv) {
smpl = gpt_sampler_init(model, sparams);
if (!smpl) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1;
}
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@ -486,9 +467,9 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
}
LOG_TEE("\n\n");
LOG("\n");
if (params.interactive) {
const char * control_message;
@ -500,11 +481,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_TEE("== Running in interactive mode. ==\n");
LOG("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
LOG( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_TEE( "%s\n", control_message);
LOG( "%s\n", control_message);
is_interacting = params.interactive_first;
}
@ -543,7 +524,7 @@ int main(int argc, char ** argv) {
llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
@ -569,9 +550,8 @@ int main(int argc, char ** argv) {
embd.resize(max_embd_size);
console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
fflush(stdout);
}
if (ga_n == 1) {
@ -581,14 +561,14 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() >= n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
@ -596,11 +576,11 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
LOG("after swap: n_past = %d\n", n_past);
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG("clear session path\n");
LOG_DBG("clear session path\n");
path_session.clear();
}
} else {
@ -610,10 +590,10 @@ int main(int argc, char ** argv) {
const int bd = (ga_w/ga_n)*(ga_n - 1);
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
LOG("\n");
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
LOG_DBG("\n");
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
@ -623,7 +603,7 @@ int main(int argc, char ** argv) {
ga_i += ga_w/ga_n;
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
}
}
@ -655,19 +635,19 @@ int main(int argc, char ** argv) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG("n_past = %d\n", n_past);
LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
}
}
@ -685,14 +665,14 @@ int main(int argc, char ** argv) {
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG("saved session to %s\n", path_session.c_str());
LOG_DBG("saved session to %s\n", path_session.c_str());
}
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
@ -702,16 +682,16 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget
--n_remain;
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
@ -726,7 +706,7 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output
fprintf(stdout, "%s", token_str.c_str());
LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
@ -738,8 +718,6 @@ int main(int argc, char ** argv) {
output_tokens.push_back(id);
output_ss << token_str;
}
fflush(stdout);
}
}
@ -788,13 +766,13 @@ int main(int argc, char ** argv) {
}
if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str());
LOG_DBG("found antiprompt: %s\n", last_output.c_str());
}
}
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG("found an EOG token\n");
LOG_DBG("found an EOG token\n");
if (params.interactive) {
if (!params.antiprompt.empty()) {
@ -808,7 +786,7 @@ int main(int argc, char ** argv) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
printf("\n");
LOG("\n");
}
}
@ -819,21 +797,21 @@ int main(int argc, char ** argv) {
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
LOG_DBG("waiting for user input\n");
if (params.conversation) {
printf("\n> ");
LOG("\n> ");
}
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str());
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
LOG("%s", params.input_prefix.c_str());
}
// color user input only
@ -856,11 +834,11 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
printf("%s", params.input_suffix.c_str());
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
LOG("%s", params.input_suffix.c_str());
}
LOG("buffer: '%s'\n", buffer.c_str());
LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
@ -877,7 +855,7 @@ int main(int argc, char ** argv) {
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
@ -900,9 +878,9 @@ int main(int argc, char ** argv) {
assistant_ss.str("");
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
@ -918,7 +896,7 @@ int main(int argc, char ** argv) {
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n");
LOG(" [end of text]\n");
break;
}
@ -931,11 +909,11 @@ int main(int argc, char ** argv) {
}
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
LOG_TEE("\n");
LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
@ -949,9 +927,5 @@ int main(int argc, char ** argv) {
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
return 0;
}

View file

@ -4,6 +4,7 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cmath>
@ -83,7 +84,9 @@ static void print_date_time() {
char buffer[80];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
LOG_INF("\n");
LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
LOG_INF("\n");
}
// Define a split string function to ...
@ -106,6 +109,8 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
@ -120,12 +125,6 @@ int main(int argc, char ** argv) {
const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("parallel", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@ -138,23 +137,22 @@ int main(int argc, char ** argv) {
// load the prompts from an external file if there are any
if (params.prompt.empty()) {
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
} else {
// Output each line of the input params.prompts vector and copy to k_prompts
int index = 0;
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
std::vector<std::string> prompts = split_string(params.prompt, '\n');
for (const auto& prompt : prompts) {
k_prompts.resize(index + 1);
k_prompts[index] = prompt;
index++;
printf("%3d prompt: %s\n", index, prompt.c_str());
LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
}
}
fprintf(stderr, "\n\n");
fflush(stderr);
LOG_INF("\n\n");
const int n_ctx = llama_n_ctx(ctx);
@ -183,19 +181,19 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_TEE("\n");
LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_INF("\n");
{
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -204,10 +202,10 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("\n");
LOG_INF("\n");
}
LOG_TEE("Processing requests ...\n\n");
LOG_INF("Processing requests ...\n\n");
while (true) {
if (dump_kv_cache) {
@ -238,7 +236,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("%s: clearing the KV cache\n", __func__);
LOG_INF("%s: clearing the KV cache\n", __func__);
}
// insert new sequences for decoding
@ -273,7 +271,7 @@ int main(int argc, char ** argv) {
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
g_seq_id += 1;
@ -317,11 +315,11 @@ int main(int argc, char ** argv) {
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return 1;
}
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
n_cache_miss += 1;
@ -332,7 +330,7 @@ int main(int argc, char ** argv) {
continue;
}
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
for (auto & client : clients) {
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
@ -377,7 +375,7 @@ int main(int argc, char ** argv) {
const auto t_main_end = ggml_time_us();
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
(t_main_end - client.t_start_prompt) / 1e6,
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
@ -400,19 +398,19 @@ int main(int argc, char ** argv) {
print_date_time();
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
if (params.prompt_file.empty()) {
params.prompt_file = "used built-in defaults";
}
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Cache misses: %6d\n", n_cache_miss);
LOG_TEE("\n");
LOG_INF("\n");
// TODO: print sampling/grammar timings for all clients
llama_perf_context_print(ctx);
@ -424,7 +422,7 @@ int main(int argc, char ** argv) {
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}

View file

@ -1,5 +1,6 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
@ -8,9 +9,9 @@
#include <vector>
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG_TEE("\n");
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
@ -24,6 +25,8 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
int n_junk = params.n_junk;
int n_keep = params.n_keep;
int n_grp = params.grp_attn_n;
@ -63,7 +66,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
LOG_ERR("%s: unable to load model\n" , __func__);
return 1;
}
@ -77,7 +80,7 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1;
}
@ -107,14 +110,14 @@ int main(int argc, char ** argv) {
const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
// print the prompt token-by-token
LOG_TEE("\n");
LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
LOG_INF("\n");
LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
LOG_INF("prompt tokens: %d\n", n_tokens_all);
//LOG_INF("prompt: %s\n", params.prompt.c_str());
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
@ -145,11 +148,11 @@ int main(int argc, char ** argv) {
}
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_INF("%s: llama_decode() failed\n", __func__);
return 1;
}
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
if (i + n_batch >= n_tokens_all) {
break;
@ -159,7 +162,7 @@ int main(int argc, char ** argv) {
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch;
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@ -179,18 +182,18 @@ int main(int argc, char ** argv) {
}
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
}
{
const int n_discard = n_past - n_ctx + n_predict;
if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@ -201,17 +204,16 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("\n");
LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
LOG_TEE("\n");
LOG_INF("\n");
LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
LOG_INF("\n");
// main loop
int n_cur = n_tokens_all;
int n_decode = 0;
LOG_TEE("%s", prompt_suffix.c_str());
fflush(stdout);
LOG_INF("%s", prompt_suffix.c_str());
const auto t_main_start = ggml_time_us();
@ -222,13 +224,12 @@ int main(int argc, char ** argv) {
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
LOG_TEE("\n");
LOG("\n");
break;
}
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
n_decode += 1;
@ -243,22 +244,22 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
LOG_TEE("\n");
LOG("\n");
const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n");
LOG("\n");
llama_perf_context_print(ctx);
fprintf(stderr, "\n");
LOG("\n");
llama_sampler_free(smpl);

View file

@ -1,7 +1,9 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
@ -41,7 +43,7 @@ static void write_logfile(
}
if (params.hellaswag) {
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
return;
}
@ -49,7 +51,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
@ -58,7 +60,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
@ -344,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@ -364,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
prob_history.resize(tokens.size());
if (params.ppl_stride <= 0) {
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
const int calc_chunk = n_ctx;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
@ -387,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
int count = 0;
double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
//LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits;
@ -407,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
//LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
//LOG_ERR("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
@ -434,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
LOG("%.2f minutes\n", total_seconds / 60.0);
}
LOG("\n");
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
@ -460,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
}
fflush(stdout);
}
printf("\n");
LOG("\n");
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
@ -488,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@ -540,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
logits.reserve((size_t)n_ctx * n_vocab);
}
fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -613,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
}
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
LOG_INF("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
@ -628,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total*n_chunk/n_seq);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
LOG("%.2f minutes\n", total_seconds / 60.0);
}
LOG("\n");
for (int seq = 0; seq < n_seq_batch; seq++) {
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
@ -656,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
} else {
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
}
}
fflush(stdout);
logits.clear();
}
printf("\n");
LOG("\n");
nll2 /= count;
nll /= count;
@ -676,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
}
llama_batch_free(batch);
@ -704,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
@ -790,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
if (prompt_lines.size() % 6 != 0) {
fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
return;
}
size_t hs_task_count = prompt_lines.size()/6;
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
LOG_INF("================================= is_spm = %d\n", is_spm);
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
@ -825,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
std::vector<llama_token> seq_tokens[4];
};
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
// Select and read data from prompt lines
std::vector<hs_data_t> hs_data(hs_task_count);
@ -871,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
}
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
printf("\ntask\tacc_norm\n");
LOG("\ntask\tacc_norm\n");
double acc = 0.0f;
@ -941,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
@ -949,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
@ -999,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
}
//printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
//LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
// If the gold ending got the maximum logprobe add one accuracy point
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@ -1007,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
// Print the accumulated accuracy mean x 100
printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
fflush(stdout);
LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
}
i0 = i1 - 1;
@ -1016,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
llama_batch_free(batch);
printf("\n");
LOG("\n");
}
struct winogrande_entry {
@ -1060,7 +1061,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
}
}
if (ipos != 4) {
printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
continue;
}
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@ -1074,13 +1075,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
if (sentence[where] == '_') break;
}
if (where == int(sentence.size())) {
printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
continue;
}
std::istringstream stream(answer.c_str());
int i_answer; stream >> i_answer;
if (stream.fail() || i_answer < 1 || i_answer > 2) {
printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
continue;
}
result.emplace_back();
@ -1109,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
auto data = load_winogrande_from_csv(params.prompt);
if (data.empty()) {
fprintf(stderr, "%s: no tasks\n", __func__);
LOG_ERR("%s: no tasks\n", __func__);
return;
}
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
std::mt19937 rng(1);
std::vector<int> aux(data.size());
for (int i = 0; i < int(data.size()); ++i) {
@ -1134,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
data = std::move(selected);
}
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
LOG_INF("%s : tokenizing selected tasks\n", __func__);
for (auto & task : data) {
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@ -1157,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
}
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
@ -1218,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
@ -1226,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
@ -1286,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
++n_done;
// print the accumulated accuracy mean x 100
printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
fflush(stdout);
LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
}
i0 = i1 - 1;
}
printf("\n");
LOG("\n");
if (n_done < 100) return;
const float p = 1.f*n_correct/n_done;
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
}
static bool deserialize_string(std::istream & in, std::string & str) {
@ -1348,7 +1349,7 @@ struct multiple_choice_task {
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__);
LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
@ -1356,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
printf("%s: found empty answer\n", __func__);
LOG_ERR("%s: found empty answer\n", __func__);
}
return false;
}
@ -1410,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
printf("%s: no tasks\n", __func__);
LOG_ERR("%s: no tasks\n", __func__);
return;
}
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
printf("%s: failed to read task positions from prompt\n", __func__);
LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
return;
}
@ -1425,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
printf("%s: reading tasks", __func__);
LOG_INF("%s: reading tasks", __func__);
int n_dot = std::max((int) n_task/100, 1);
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
if (i%n_dot == 0) printf(".");
if (i%n_dot == 0) LOG(".");
}
printf("done\n");
LOG("done\n");
}
else {
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@ -1452,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
printf("%s: preparing task data", __func__);
fflush(stdout);
LOG_INF("%s: preparing task data", __func__);
if (n_task > 500) {
printf("...");
fflush(stdout);
LOG("...");
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@ -1487,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
printf("done\n");
fflush(stdout);
LOG("done\n");
int nbad = n_bad;
if (nbad > 0) {
printf("%s: found %d malformed tasks\n", __func__, nbad);
LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
@ -1503,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
return;
}
if (i_task%n_dot == 0) {
printf(".");
fflush(stdout);
LOG(".");
}
}
printf("done\n");
LOG("done\n");
}
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
printf("\ntask\tacc_norm\n");
LOG("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
@ -1591,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
@ -1599,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
@ -1623,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
// printf("%d", j+1);
// LOG("%d", j+1);
// }
//}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
//LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
// get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@ -1641,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
//printf(" %zu %g\n", ir, eval_results[ir]);
//LOG(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
//printf(" Final: %g\n", log_prob / count);
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
//LOG(" Final: %g\n", log_prob / count);
//LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
@ -1667,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
++n_done;
// Print the accumulated accuracy mean x 100
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
fflush(stdout);
LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
}
i0 = i1 - 1;
@ -1680,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
LOG("\n");
LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
printf("\n");
LOG_INF("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) {
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
@ -1710,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
@ -1718,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
@ -1776,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
@ -1797,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return;
}
@ -1814,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
LOG("%.2f minutes\n", total_seconds / 60.0);
}
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@ -1832,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first;
printf("%4d", i+1);
LOG("%4d", i+1);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
printf("\n");
fflush(stdout);
LOG("\n");
logits.clear();
}
printf("\n");
LOG("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end());
std::sort(p_diff_values.begin(), p_diff_values.end());
printf("====== Perplexity statistics ======\n");
LOG("====== Perplexity statistics ======\n");
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double ppl_base_val = exp(log_ppl_base.first);
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
// LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
const double ppl_ratio_val = exp(log_ppl_ratio_val);
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
const double ppl_diff_val = ppl_val - ppl_base_val;
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
printf("\n");
LOG("\n");
printf("====== KL divergence statistics ======\n");
LOG("====== KL divergence statistics ======\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2];
@ -1916,50 +1911,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
};
printf("Maximum KLD: %10.6f\n", kld_values.back());
printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("Median KLD: %10.6f\n", kld_median);
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
printf("Minimum KLD: %10.6f\n", kld_values.front());
LOG("Maximum KLD: %10.6f\n", kld_values.back());
LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
LOG("Median KLD: %10.6f\n", kld_median);
LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
LOG("Minimum KLD: %10.6f\n", kld_values.front());
printf("\n");
LOG("\n");
printf("====== Token probability statistics ======\n");
LOG("====== Token probability statistics ======\n");
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
: p_diff_values[p_diff_values.size()/2];
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
// LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
const double same_top_p = 1.0*kld.n_same_top/kld.count;
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
}
int main(int argc, char ** argv) {
@ -1972,10 +1966,12 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
return 1;
}
@ -2000,13 +1996,11 @@ int main(int argc, char ** argv) {
}
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
}
print_build_info();
llama_backend_init();
llama_numa_init(params.numa);
@ -2016,21 +2010,21 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
struct results_perplexity results;
@ -2046,8 +2040,9 @@ int main(int argc, char ** argv) {
results = perplexity(ctx, params, n_ctx);
}
LOG_TEE("\n");
LOG("\n");
llama_perf_context_print(ctx);
write_logfile(ctx, params, model, results);
llama_free(ctx);

View file

@ -1,14 +1,16 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <fstream>
#include <iostream> // TODO: remove me
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
LOG_TEE("\n");
LOG("\nexample usage:\n");
LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
LOG("\n");
}
struct chunk {
@ -17,7 +19,7 @@ struct chunk {
// original file position
size_t filepos;
// original text data
std::string textdata = "";
std::string textdata;
// tokenized text data
std::vector<llama_token> tokens;
// embedding
@ -31,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
std::ifstream f(filename.c_str());
if (!f.is_open()) {
fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
LOG_ERR("could not open file %s\n", filename.c_str());
return chunks;
}
chunk current_chunk;
char buffer[1024];
int64_t filepos = 0;
std::string current = "";
std::string current;
while (f.read(buffer, 1024)) {
current += std::string(buffer, f.gcount());
size_t pos;
@ -84,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
LOG_ERR("%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
@ -99,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
@ -116,24 +118,24 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
// For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
params.embedding = true;
if (params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n");
LOG_ERR("chunk_size must be positive\n");
return 1;
}
if (params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n");
LOG_ERR("context_files must be specified\n");
return 1;
}
print_build_info();
printf("processing files:\n");
LOG_INF("processing files:\n");
for (auto & context_file : params.context_files) {
printf("%s\n", context_file.c_str());
LOG_INF("%s\n", context_file.c_str());
}
std::vector<chunk> chunks;
@ -141,7 +143,7 @@ int main(int argc, char ** argv) {
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
printf("Number of chunks: %ld\n", chunks.size());
LOG_INF("Number of chunks: %ld\n", chunks.size());
llama_backend_init();
llama_numa_init(params.numa);
@ -153,7 +155,7 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_init.context;
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
@ -162,19 +164,19 @@ int main(int argc, char ** argv) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
LOG_ERR("%s: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
// max batch size
@ -185,7 +187,7 @@ int main(int argc, char ** argv) {
for (auto & chunk : chunks) {
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
@ -199,12 +201,12 @@ int main(int argc, char ** argv) {
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) chunks.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
}
fprintf(stderr, "\n\n");
LOG_INF("\n\n");
}
}
@ -256,7 +258,7 @@ int main(int argc, char ** argv) {
// start loop, receive query and return top k similar chunks based on cosine similarity
std::string query;
while (true) {
printf("Enter query: ");
LOG("Enter query: ");
std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
@ -280,18 +282,18 @@ int main(int argc, char ** argv) {
return a.second > b.second;
});
printf("Top %d similar chunks:\n", params.sparams.top_k);
LOG("Top %d similar chunks:\n", params.sparams.top_k);
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
printf("similarity: %f\n", similarities[i].second);
printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
printf("--------------------\n");
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
LOG("similarity: %f\n", similarities[i].second);
LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
LOG("--------------------\n");
}
}
}
LOG_TEE("\n");
LOG("\n");
llama_perf_context_print(ctx);
// clean up

View file

@ -1,6 +1,6 @@
set(TARGET llama-server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@ -46,9 +46,6 @@ endforeach()
add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

View file

@ -121,7 +121,6 @@ The project is under active development, and we are [looking for feedback and co
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
| `--log-format {text, json}` | log output format: json or text (default: json) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |

View file

@ -40,7 +40,6 @@ server --host localhost --port 8080 \
--parallel 8 \
--batch-size 512 \
--ctx-size 4096 \
--log-format text \
-ngl 33
```

View file

@ -272,7 +272,6 @@ def start_server_background(args):
server_args.append('--cont-batching')
server_args.append('--metrics')
server_args.append('--flash-attn')
server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
pkwargs = {

File diff suppressed because it is too large Load diff

1
examples/server/tests/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
.venv

View file

@ -40,7 +40,6 @@ It's possible to override some scenario steps values with environment variables:
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
### Run @bug, @wip or @wrong_usage annotated scenario

View file

@ -1372,8 +1372,6 @@ def start_server_background(context):
server_args.append('--verbose')
if context.lora_file:
server_args.extend(['--lora', context.lora_file])
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [context.server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")

View file

@ -1,7 +1,8 @@
#pragma once
#include "llama.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@ -15,10 +16,10 @@
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include <random>
#include <sstream>
#include <string>
#include <vector>
#include <sstream>
#include <random>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
@ -35,32 +36,6 @@ enum error_type {
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
extern bool server_verbose;
extern bool server_log_json;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value
@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
try {
return body.at(key);
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
std::stringstream ss;
ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
LOG_WARNING(ss.str().c_str(), body);
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
return default_value;
}
} else {
@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul
}
}
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = json{
{"tid", ss_tid.str()},
{"timestamp", time(nullptr)},
};
if (server_log_json) {
log.merge_patch({
{"level", level},
{"function", function},
{"line", line},
{"msg", message},
});
if (!extra.empty()) {
log.merge_patch(extra);
}
printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
} else {
char buf[1024];
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
if (!extra.empty()) {
log.merge_patch(extra);
}
std::stringstream ss;
ss << buf << " |";
for (const auto & el : log.items())
{
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
ss << " " << el.key() << "=" << value;
}
const std::string str = ss.str();
printf("%.*s\n", (int)str.size(), str.data());
}
fflush(stdout);
}
//
// chat template utils
//
@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
chat.push_back({role, content});
}
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
return formatted_chat;
}
@ -243,10 +175,7 @@ static std::string random_string() {
}
static std::string gen_chatcmplid() {
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
return "chatcmpl-" + random_string();
}
//
@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
return std::string::npos;
}
static bool json_is_array_of_numbers(json data) {
static bool json_is_array_of_numbers(const json & data) {
if (data.is_array()) {
for (const auto & e : data) {
if (!e.is_number()) {
@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
return out;
}
static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
const std::string str =
std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
"\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
LOG_VERBOSE("data stream", {
{ "to_send", str }
});
LOG_DBG("data stream, to_send: %s", str.c_str());
return sink.write(str.c_str(), str.size());
}
@ -425,7 +352,7 @@ static json oaicompat_completion_params_parse(
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
for (auto & param : unsupported_params) {
for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
@ -444,7 +371,7 @@ static json oaicompat_completion_params_parse(
return llama_params;
}
static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@ -481,7 +408,8 @@ static json format_final_response_oaicompat(const json & request, json result, c
{"id", completion_id}
};
if (server_verbose) {
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = result;
}
@ -493,7 +421,7 @@ static json format_final_response_oaicompat(const json & request, json result, c
}
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
@ -595,7 +523,7 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array();
int i = 0;
for (auto & elem : embeddings) {
for (const auto & elem : embeddings) {
data.push_back(json{
{"embedding", json_value(elem, "embedding", json::array())},
{"index", i++},

View file

@ -1,16 +1,14 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
LOG_TEE("\n");
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
@ -23,6 +21,8 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
// total length of the sequence including the prompt
const int n_predict = params.n_predict;
@ -69,25 +69,24 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
LOG("\n");
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
fprintf(stderr, "\n");
LOG("\n");
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
// create a llama_batch with size 512
// we use this object to submit token data for decoding
@ -102,7 +101,7 @@ int main(int argc, char ** argv) {
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -116,16 +115,16 @@ int main(int argc, char ** argv) {
while (n_cur <= n_predict) {
// sample the next token
{
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
LOG_TEE("\n");
LOG("\n");
break;
}
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
// prepare the next batch
@ -141,23 +140,23 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
LOG_TEE("\n");
LOG("\n");
const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n");
LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
fprintf(stderr, "\n");
LOG("\n");
llama_batch_free(batch);
llama_sampler_free(smpl);

View file

@ -1,13 +1,16 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <random>
#include <set>
#include <string>
#include <vector>
#include <set>
#include <random>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -33,8 +36,10 @@ int main(int argc, char ** argv) {
return 1;
}
gpt_init();
if (params.model_draft.empty()) {
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
@ -47,12 +52,6 @@ int main(int argc, char ** argv) {
std::default_random_engine rng(params.sparams.seed);
std::uniform_real_distribution<> u_dist;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@ -81,14 +80,14 @@ int main(int argc, char ** argv) {
ctx_dft = llama_init_dft.context;
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
LOG("vocab_type tgt: %d\n", vocab_type_tgt);
LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(model_dft);
LOG("vocab_type dft: %d\n", vocab_type_dft);
LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
return 1;
}
@ -98,7 +97,7 @@ int main(int argc, char ** argv) {
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
) {
fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
return 1;
}
@ -110,8 +109,8 @@ int main(int argc, char ** argv) {
: n_vocab_dft - n_vocab_tgt;
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}
@ -120,8 +119,8 @@ int main(int argc, char ** argv) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
@ -138,18 +137,16 @@ int main(int argc, char ** argv) {
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
LOG("\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
@ -211,7 +208,7 @@ int main(int argc, char ** argv) {
active_seqs.insert(s);
const auto & tokens = drafts[s].tokens;
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
}
int i_dft = 0;
@ -254,7 +251,7 @@ int main(int argc, char ** argv) {
continue;
}
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
float r = u_dist(rng);
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
@ -272,7 +269,7 @@ int main(int argc, char ** argv) {
break;
}
}
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
if (r <= p_tgt / p_dft) {
s_keep = s;
accept = true;
@ -280,10 +277,10 @@ int main(int argc, char ** argv) {
token_str = llama_token_to_piece(ctx_tgt, token_id);
gpt_sampler_accept(smpl, token_id, true);
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;
// calculate residual probability
@ -338,7 +335,7 @@ int main(int argc, char ** argv) {
if (!accept) {
// all drafted tokens were rejected
// sample from the target model
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
std::vector<float> probs(dist_tgt.size);
for (size_t i = 0; i < dist_tgt.size; ++i) {
probs[i] = dist_tgt.data[i].p;
@ -356,13 +353,11 @@ int main(int argc, char ** argv) {
// greedy verification
// sample from the target model
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
gpt_sampler_accept(smpl, token_id, true);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
token_str = llama_token_to_piece(ctx_tgt, token_id);
for (int s = 0; s < n_seq_dft; ++s) {
@ -371,7 +366,7 @@ int main(int argc, char ** argv) {
}
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
s_keep = s;
accept = true;
@ -393,26 +388,24 @@ int main(int argc, char ** argv) {
++i_dft;
if (params.use_color) {
// Color token according to its origin sequence
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
} else {
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
}
fflush(stdout);
continue;
} else {
printf("%s", token_str.c_str());
fflush(stdout);
LOG("%s", token_str.c_str());
break;
}
}
}
{
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
// TODO: simplify
{
LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_kv_cache_seq_keep(ctx_dft, s_keep);
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
@ -439,7 +432,7 @@ int main(int argc, char ** argv) {
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);
++n_past_dft;
@ -486,7 +479,7 @@ int main(int argc, char ** argv) {
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
}
@ -495,7 +488,7 @@ int main(int argc, char ** argv) {
// attempt to split the branch if the probability is high enough
for (int f = 1; f < 8; ++f) {
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
@ -584,7 +577,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt;
}
@ -602,23 +595,25 @@ int main(int argc, char ** argv) {
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ndraft:\n\n");
LOG_INF("\n");
LOG_INF("draft:\n\n");
// TODO: print sampling/grammar timings for all drafts
llama_perf_context_print(ctx_dft);
LOG_TEE("\ntarget:\n\n");
LOG_INF("\n");
LOG_INF("target:\n\n");
gpt_perf_print(ctx_tgt, smpl);
gpt_sampler_free(smpl);
@ -637,7 +632,7 @@ int main(int argc, char ** argv) {
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}

View file

@ -1,11 +1,13 @@
#include "common.h"
//#include "log.h" // TODO: start using log.h
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>
#include <iostream> // TODO: remove me
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
@ -13,25 +15,25 @@
#include <shellapi.h> // For CommandLineToArgvW
#endif
static void print_usage_information(const char * argv0, FILE * stream) {
fprintf(stream, "usage: %s [options]\n\n", argv0);
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
fprintf(stream, " The possible options are:\n");
fprintf(stream, "\n");
fprintf(stream, " -h, --help print this help and exit\n");
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n");
static void print_usage_information(const char * argv0) {
printf("usage: %s [options]\n\n", argv0);
printf("The tokenize program tokenizes a prompt using a given model,\n");
printf("and prints the resulting tokens to standard output.\n\n");
printf("It needs a model file, a prompt, and optionally other flags\n");
printf("to control the behavior of the tokenizer.\n\n");
printf(" The possible options are:\n");
printf("\n");
printf(" -h, --help print this help and exit\n");
printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
printf(" --stdin read prompt from standard input.\n");
printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
printf(" --no-parse-special do not parse control tokens.\n");
printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
printf(" --show-count print the total number of tokens.\n");
}
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
const int argc = argv.size();
if (argc <= 1) {
print_usage_information(argv[0].c_str(), stderr);
print_usage_information(argv[0].c_str());
return 1;
}
@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
for (; iarg < argc; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-h" || arg == "--help") {
print_usage_information(argv[0].c_str(), stdout);
print_usage_information(argv[0].c_str());
return 0;
}
else if (arg == "--ids") {
@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
// Start actually doing the tokenizing stuff.
//////
#ifdef LOG_DISABLE_LOGS
disable_logging = true;
#endif
if (disable_logging) {
llama_log_set(llama_log_callback_null, NULL);
}