model : more uniform output id handling (#14275)

* model : more uniform output id handling

ggml-ci

* cont : revert n_outputs < n_tokens optimization

ggml-ci

* cont : fix out_ids initialization

ggml-ci
Georgi Gerganov, 2025-06-20 10:50:27 +03:00 (committed by GitHub)
parent 4c9fdfbe15
commit 812939a9e9
2 changed files with 459 additions and 442 deletions

src/llama-graph.cpp

@@ -92,36 +92,28 @@ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
 }
 
 void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
-    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        //GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
+    GGML_ASSERT(out_ids);
 
-        if (!out_ids) {
-            LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
-        } else {
-            const int64_t n_tokens = ubatch->n_tokens;
+    const int64_t n_tokens = ubatch->n_tokens;
 
-            GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
-            int32_t * data = (int32_t *) out_ids->data;
+    GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
+    int32_t * data = (int32_t *) out_ids->data;
 
-            if (n_outputs == n_tokens) {
-                for (int i = 0; i < n_tokens; ++i) {
-                    data[i] = i;
-                }
-            } else if (ubatch->output) {
-                int32_t n_outputs = 0;
-                for (int i = 0; i < n_tokens; ++i) {
-                    if (ubatch->output[i]) {
-                        data[n_outputs++] = i;
-                    }
-                }
-                // the graph needs to have been passed the correct number of outputs
-                GGML_ASSERT(n_outputs == n_outputs);
-            } else if (n_outputs == 1) {
-                // only keep last output
-                data[0] = n_tokens - 1;
-            } else {
-                GGML_ASSERT(n_outputs == 0);
-            }
-        }
-    }
+    if (n_outputs == n_tokens) {
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = i;
+        }
+
+        return;
+    }
+
+    GGML_ASSERT(ubatch->output);
+
+    int n_outputs = 0;
+
+    for (int i = 0; i < n_tokens; ++i) {
+        if (ubatch->output[i]) {
+            data[n_outputs++] = i;
+        }
+    }
 }
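
(For reference, the rewritten set_input() boils down to one compaction rule: identity indices when every token is an output, otherwise the positions of the tokens whose ubatch->output flag is set. Below is a minimal standalone sketch of that rule in plain C++ — make_out_ids is a hypothetical helper written for illustration, with std::vector stand-ins for the ubatch fields, not code from the repository.)

#include <cassert>
#include <cstdint>
#include <vector>

// Build the indices that out_ids would hold for one ubatch: the identity
// mapping when every token is an output, otherwise the positions of the
// tokens whose output flag is set.
static std::vector<int32_t> make_out_ids(const std::vector<int8_t> & output, int64_t n_outputs) {
    const int64_t n_tokens = (int64_t) output.size();

    std::vector<int32_t> data;
    data.reserve(n_outputs);

    if (n_outputs == n_tokens) {
        for (int32_t i = 0; i < (int32_t) n_tokens; ++i) {
            data.push_back(i); // every row is kept
        }
        return data;
    }

    for (int32_t i = 0; i < (int32_t) n_tokens; ++i) {
        if (output[i]) {
            data.push_back(i); // keep only the rows flagged as outputs
        }
    }

    // the caller must have sized the graph for exactly this many outputs
    assert((int64_t) data.size() == n_outputs);

    return data;
}

int main() {
    // 5 tokens, only the last two produce logits -> out_ids = {3, 4}
    const std::vector<int8_t> output = {0, 0, 0, 1, 1};
    const std::vector<int32_t> ids = make_out_ids(output, /*n_outputs =*/ 2);
    assert(ids.size() == 2 && ids[0] == 3 && ids[1] == 4);
    return 0;
}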
@@ -874,6 +866,14 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
+    // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
+    //       but this would make the graph topology depend on the number of output tokens, which can interfere with
+    //       features that require constant topology such as pipeline parallelism
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
+    //if (n_outputs < n_tokens) {
+    //    return nullptr;
+    //}
+
     auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
 
     auto & cur = inp->out_ids;
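
(The restored comment is about keeping graph topology constant. A hedged sketch of the pattern, assuming only the public ggml C API: an out_ids tensor always feeds a ggml_get_rows() node, so the node count and wiring of the graph never depend on how many rows are actually selected — only the tensor contents change per ubatch. The shapes below are arbitrary example values.)

#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t n_embd    = 8;
    const int64_t n_tokens  = 5;
    const int64_t n_outputs = 2;

    // hidden states for all tokens, and the indices of the rows to keep
    struct ggml_tensor * hidden  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_outputs);

    // the get_rows node exists unconditionally -> constant topology
    struct ggml_tensor * out = ggml_get_rows(ctx, hidden, out_ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    // per ubatch only the contents of out_ids change (cf. set_input above),
    // e.g. keep the last two of five tokens:
    ((int32_t *) out_ids->data)[0] = 3;
    ((int32_t *) out_ids->data)[1] = 4;

    ggml_free(ctx);
    return 0;
}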

File diff suppressed because it is too large.