mtmd : support Qwen 2.5 Omni (input audio+vision, no audio output) (#13784)

* mtmd : allow multiple modalities at the same time * refactor mtmd tokenizer * fix compile * ok, missing SinusoidsPositionEmbedding * first working version * fix style * more strict validate of n_embd * refactor if..else to switch * fix regression * add test for 3B * update docs * fix tokenizing with add_special * add more tests * fix test case "huge" * rm redundant code * set_position_mrope_1d rm n_tokens
2025-05-27 14:06:10 +02:00 · 2025-05-27 14:06:10 +02:00 · bc583e3c63
commit bc583e3c63
parent 72b090da2c
12 changed files with 1148 additions and 744 deletions
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@ -66,7 +66,8 @@ struct decode_embd_batch {
        }
    }

-    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+    // M-RoPE for image
+    void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
        GGML_ASSERT(n_pos_per_embd == 4);
        seq_id_0[0] = seq_id;
        for (int y = 0; y < ny; y++) {
@ -85,6 +86,23 @@ struct decode_embd_batch {
        }
    }

+    // M-RoPE for audio
+    void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
+        GGML_ASSERT(n_pos_per_embd == 4);
+        seq_id_0[0] = seq_id;
+        for (int i = 0; i < batch.n_tokens; i++) {
+            pos[i                     ] = pos_0 + i;
+            pos[i + batch.n_tokens    ] = pos_0 + i;
+            pos[i + batch.n_tokens * 2] = pos_0 + i;
+            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+        }
+        for (int i = 0; i < batch.n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+
    llama_batch get_view(int offset, int n_tokens) {
        llama_pos * pos_ptr;
        pos_view.clear();
@ -146,18 +164,20 @@ int32_t mtmd_helper_decode_image_chunk(
    decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);

    if (mtmd_decode_use_mrope(ctx)) {
-        const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-        if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
-            return -1;
+        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            if (!image_tokens) {
+                LOG_ERR("failed to decode chunk: image tokens are null\n");
+                return -1;
+            }
+            const int nx = mtmd_image_tokens_get_nx(image_tokens);
+            const int ny = mtmd_image_tokens_get_ny(image_tokens);
+            batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
+        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            batch_embd.set_position_mrope_1d(n_past, seq_id);
+        } else {
+            GGML_ABORT("invalid chunk type for M-RoPE");
        }
-        if (!image_tokens) {
-            LOG_ERR("failed to decode chunk: image tokens are null\n");
-            return -1;
-        }
-        const int nx = mtmd_image_tokens_get_nx(image_tokens);
-        const int ny = mtmd_image_tokens_get_ny(image_tokens);
-        batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
    } else {
        batch_embd.set_position_normal(n_past, seq_id);
    }