From abd4d0bc4f1a9a0e429bc8ee0d5ece2a394a0a39 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 19 Feb 2025 13:29:42 +0200
Subject: [PATCH] speculative : update default params (#11954)

* speculative : update default params

* speculative : do not discard the last drafted token
---
 common/common.h            |  4 ++--
 common/speculative.cpp     | 10 +++++-----
 common/speculative.h       |  2 +-
 examples/server/server.cpp |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/common/common.h b/common/common.h
index 10bcc10d..efe8e7f7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -178,10 +178,10 @@ struct common_params_speculative {
     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
     int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 
     float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 318e96ea..b1fff27a 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
diff --git a/common/speculative.h b/common/speculative.h
index 2baf99fc..2b51a70c 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16;  // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.9f;  // min probability required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 809bfe0e..2306dc26 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
         // Use OpenAI API logprobs only if n_probs wasn't provided
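
Note: besides the relaxed defaults, the functional change here is the order of
operations in the drafting loop of common_speculative_gen_draft(): the p_min
confidence check now runs after the current token has been accepted into the
draft, so the first low-confidence token still ends the draft but is no longer
thrown away. Below is a minimal standalone sketch of that reordering; the types,
names, and data are hypothetical stand-ins for illustration, not the llama.cpp
API.

#include <cstdio>
#include <vector>

struct drafted { int id; float p; }; // hypothetical stand-in for cur_p->data[0]

// New rule: keep the current token first, then stop if the draft is full or
// the token's confidence fell below p_min (the token itself is still kept).
static std::vector<int> gen_draft(const std::vector<drafted> & cand, size_t n_max, float p_min) {
    std::vector<int> result;
    for (const auto & c : cand) {
        result.push_back(c.id);       // accept the token before any early exit
        if (result.size() >= n_max) {
            break;                    // draft is full
        }
        if (c.p < p_min) {
            break;                    // low confidence ends the draft; token stays
        }
    }
    return result;
}

int main() {
    // With p_min = 0.75f the third token (p = 0.60) ends the draft but is kept,
    // yielding {10, 11, 12}; under the old order it would have been discarded.
    const std::vector<drafted> cand = { {10, 0.95f}, {11, 0.90f}, {12, 0.60f}, {13, 0.99f} };
    for (int id : gen_draft(cand, 16, 0.75f)) {
        std::printf("%d ", id);
    }
    std::printf("\n");
    return 0;
}

Read together with the new defaults (n_min = 0, p_min = 0.75f), the effect
appears to be that even short, moderately confident drafts can be offered to the
target model for verification instead of being rejected outright.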