mtmd : add qwen2vl and qwen2.5vl (#13141)

* llava : add clip_n_output_tokens, deprecate clip_n_patches * mtmd : add qwen2vl and qwen2.5vl * decode_embd_batch::set_position_... * working version * deprecate llama-qwen2vl-cli * correct order W, H of clip_embd_nbytes_by_img * edit existing line in hot topics
2025-04-29 11:47:04 +02:00 · 2025-04-29 11:47:04 +02:00 · 00e3e5a194
commit 00e3e5a194
parent e98b3692be
10 changed files with 196 additions and 79 deletions
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@ -102,6 +102,7 @@ MTMD_API size_t      mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im
 MTMD_API size_t      mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
 MTMD_API size_t      mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
 MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos   mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
 MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);

 // returns 0 on success
@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // whether we need to set non-causal mask before llama_decode
 MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);

+// whether the current model use M-RoPE for llama_decode
+MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+


 //
 // helper functions (can be implemented based on other functions)
 //

-// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);

+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
+
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
 // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()