llama : remove llama_kv_cache_view API + remove deprecated (#13653)
ggml-ci
This commit is contained in:
parent b69f1647f9
commit a4090d1174

10 changed files with 1 addition and 390 deletions
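The hunks below all strip the same KV-cache-view debug helper out of the example programs. For reference, here is a minimal sketch of the pattern being deleted, reconstructed only from the removed lines; it is not part of the commit itself. The function name run_with_kv_dump, the n_cells parameter, and the loop body are placeholders standing in for each example's own values (W + G + 1, 1, or n_clients) and decode loop.

#include "llama.h"   // llama_kv_cache_view_* (removed by this commit)
#include "common.h"  // common_kv_cache_dump_view_seqs (removed by this commit)

// Sketch of the removed debug pattern: create a view once, update and dump it
// on every iteration of the decode loop, free it at shutdown.
static void run_with_kv_dump(llama_context * ctx, bool dump_kv_cache, int32_t n_cells) {
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_cells);

    while (true) {
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
            common_kv_cache_dump_view_seqs(kvc_view, 40);
        }

        // ... per-example decoding work ...
        break; // placeholder: the real loops exit on their own conditions
    }

    llama_kv_cache_view_free(&kvc_view);
}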
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
     const int N = 5; // n-gram size
     const int G = 15; // max verification n-grams
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
     // here we keep adding new n-grams as we go
     ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
     const auto t_dec_start = ggml_time_us();
 
     // sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
     }
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
         //
         // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {
 
     common_sampler_free(smpl);
 
-    llama_kv_cache_view_free(&kvc_view);
-
     llama_batch_free(batch);
 
     llama_backend_free();
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
    // max. number of additional tokens to draft if match is found
    const int n_draft = params.speculative.n_max;
 
-   const bool dump_kv_cache = params.dump_kv_cache;
-
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){
 
    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
 
-   // debug
-   struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
    const auto t_dec_start = ggml_time_us();
 
    while (true) {
-       // debug
-       if (dump_kv_cache) {
-           llama_kv_cache_view_update(ctx, &kvc_view);
-           common_kv_cache_dump_view_seqs(kvc_view, 40);
-       }
-
        // print current draft sequence
        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
 
@@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // is the system prompt shared in the cache
     const bool is_sp_shared = params.is_pp_shared;
 
@@ -241,8 +239,6 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen  = 0;
     int32_t n_cache_miss = 0;
 
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
     const auto t_main_start = ggml_time_us();
 
     LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@ int main(int argc, char ** argv) {
     LOG_INF("Processing requests ...\n\n");
 
     while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
        common_batch_clear(batch);
 
        // decode any currently ongoing sequences