model : more uniform output id handling (#14275)

* model : more uniform output id handling

ggml-ci

* cont : revert n_outputs < n_tokens optimization

ggml-ci

* cont : fix out_ids initialization

ggml-ci
Georgi Gerganov, 2025-06-20 10:50:27 +03:00 (committed by GitHub)
parent 4c9fdfbe15
commit 812939a9e9
2 changed files with 459 additions and 442 deletions

src/llama-graph.cpp

@@ -92,36 +92,28 @@ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
 }
 
 void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
-    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        //GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
+    GGML_ASSERT(out_ids);
 
-        if (!out_ids) {
-            LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
-        } else {
-            const int64_t n_tokens = ubatch->n_tokens;
+    const int64_t n_tokens = ubatch->n_tokens;
 
-            GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
-            int32_t * data = (int32_t *) out_ids->data;
+    GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
+    int32_t * data = (int32_t *) out_ids->data;
 
-            if (n_outputs == n_tokens) {
-                for (int i = 0; i < n_tokens; ++i) {
-                    data[i] = i;
-                }
-            } else if (ubatch->output) {
-                int32_t n_outputs = 0;
-                for (int i = 0; i < n_tokens; ++i) {
-                    if (ubatch->output[i]) {
-                        data[n_outputs++] = i;
-                    }
-                }
-                // the graph needs to have been passed the correct number of outputs
-                GGML_ASSERT(n_outputs == n_outputs);
-            } else if (n_outputs == 1) {
-                // only keep last output
-                data[0] = n_tokens - 1;
-            } else {
-                GGML_ASSERT(n_outputs == 0);
-            }
-        }
-    }
+    if (n_outputs == n_tokens) {
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = i;
+        }
+
+        return;
+    }
+
+    GGML_ASSERT(ubatch->output);
+
+    int n_outputs = 0;
+
+    for (int i = 0; i < n_tokens; ++i) {
+        if (ubatch->output[i]) {
+            data[n_outputs++] = i;
+        }
+    }
 }
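
(For reference, the rewritten set_input() boils down to one compaction rule: identity indices when every token is an output, otherwise the positions of the tokens whose ubatch->output flag is set. Below is a minimal standalone sketch of that rule in plain C++ — make_out_ids is a hypothetical helper written for illustration, with std::vector stand-ins for the ubatch fields, not code from the repository.)

#include <cassert>
#include <cstdint>
#include <vector>

// Build the indices that out_ids would hold for one ubatch: the identity
// mapping when every token is an output, otherwise the positions of the
// tokens whose output flag is set.
static std::vector<int32_t> make_out_ids(const std::vector<int8_t> & output, int64_t n_outputs) {
    const int64_t n_tokens = (int64_t) output.size();

    std::vector<int32_t> data;
    data.reserve(n_outputs);

    if (n_outputs == n_tokens) {
        for (int32_t i = 0; i < (int32_t) n_tokens; ++i) {
            data.push_back(i); // every row is kept
        }
        return data;
    }

    for (int32_t i = 0; i < (int32_t) n_tokens; ++i) {
        if (output[i]) {
            data.push_back(i); // keep only the rows flagged as outputs
        }
    }

    // the caller must have sized the graph for exactly this many outputs
    assert((int64_t) data.size() == n_outputs);

    return data;
}

int main() {
    // 5 tokens, only the last two produce logits -> out_ids = {3, 4}
    const std::vector<int8_t> output = {0, 0, 0, 1, 1};
    const std::vector<int32_t> ids = make_out_ids(output, /*n_outputs =*/ 2);
    assert(ids.size() == 2 && ids[0] == 3 && ids[1] == 4);
    return 0;
}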
@@ -874,6 +866,14 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
+    // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
+    //       but this would make the graph topology depend on the number of output tokens, which can interfere with
+    //       features that require constant topology such as pipeline parallelism
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
+    //if (n_outputs < n_tokens) {
+    //    return nullptr;
+    //}
+
     auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
 
     auto & cur = inp->out_ids;
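
(The restored comment is about keeping graph topology constant. A hedged sketch of the pattern, assuming only the public ggml C API: an out_ids tensor always feeds a ggml_get_rows() node, so the node count and wiring of the graph never depend on how many rows are actually selected — only the tensor contents change per ubatch. The shapes below are arbitrary example values.)

#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t n_embd    = 8;
    const int64_t n_tokens  = 5;
    const int64_t n_outputs = 2;

    // hidden states for all tokens, and the indices of the rows to keep
    struct ggml_tensor * hidden  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_outputs);

    // the get_rows node exists unconditionally -> constant topology
    struct ggml_tensor * out = ggml_get_rows(ctx, hidden, out_ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    // per ubatch only the contents of out_ids change (cf. set_input above),
    // e.g. keep the last two of five tokens:
    ((int32_t *) out_ids->data)[0] = 3;
    ((int32_t *) out_ids->data)[1] = 4;

    ggml_free(ctx);
    return 0;
}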

File diff suppressed because it is too large.