mtmd : add vision support for llama 4 (#13282)

* wip llama 4 conversion

* rm redundant __init__

* fix conversion

* fix conversion

* test impl

* try this

* reshape patch_embeddings_0

* fix view

* rm ffn_post_norm

* cgraph ok

* f32 for pos embd

* add image marker tokens

* Llama4UnfoldConvolution

* correct pixel shuffle

* fix merge conflicts

* correct

* add debug_graph

* logits matched, but it still perceives the image incorrectly

* fix style

* add image_grid_pinpoints

* handle llama 4 preprocessing

* rm load_image_size

* rm unused line

* fix

* small fix 2

* add test & docs

* fix llava-1.6 test

* test: add notion of huge models

* add comment

* add warn about degraded quality
Xuan-Son Nguyen, 2025-05-19 13:04:14 +02:00, committed by GitHub
parent f71f40a284
commit 92ecdcc06a
9 changed files with 424 additions and 82 deletions

@@ -42,6 +42,7 @@ enum mtmd_slice_tmpl {
     MTMD_SLICE_TMPL_NONE,
     MTMD_SLICE_TMPL_MINICPMV_2_5,
     MTMD_SLICE_TMPL_MINICPMV_2_6,
+    MTMD_SLICE_TMPL_LLAMA4,
     // TODO @ngxson : add support for idefics (SmolVLM)
 };
@@ -64,15 +65,19 @@ struct mtmd_context {
     int n_threads;
     std::string image_marker;

-    // for minicpmv, we need special tokens in-between slices
+    // for llava-uhd style models, we need special tokens in-between slices
+    // minicpmv calls them "slices", llama 4 calls them "tiles"
     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
     llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
     llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
-    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
+    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
     llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+    bool tok_row_end_trail = false;
+    bool ov_img_first      = false;

     bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
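
Read together, these fields form a small declarative slice template: mtmd_tokenize emits whichever tokens are non-NULL and skips the rest. A rough sketch of the sequence it builds, assuming every field were set (the overview block moves after the slices when ov_img_first is false):

    tok_ov_img_start (overview) tok_ov_img_end
    tok_slices_start
        tok_sli_img_start (slice) tok_sli_img_end tok_sli_img_mid ... tok_row_end
    tok_slices_end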
@@ -96,6 +101,7 @@ struct mtmd_context {
         use_mrope = clip_is_qwen2vl(ctx_clip);

+        projector_type proj = clip_get_projector_type(ctx_clip);
         int minicpmv_version = clip_is_minicpmv(ctx_clip);
         if (minicpmv_version == 2) {
             // minicpmv 2.5 format:
@@ -108,6 +114,8 @@ struct mtmd_context {
             tok_sli_img_start = tok_ov_img_start;
             tok_sli_img_end   = tok_ov_img_end;
             tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
         } else if (minicpmv_version == 3 || minicpmv_version == 4) {
             // minicpmv 2.6 format:
@@ -118,9 +126,25 @@ struct mtmd_context {
             tok_sli_img_start = lookup_token("<slice>");
             tok_sli_img_end   = lookup_token("</slice>");
             tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
-        } else {
+        } else if (minicpmv_version != 0) {
             GGML_ASSERT(false && "unsupported minicpmv version");
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // llama 4 format:
+            // <|image_start|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
+            //     <|image|> (overview)       <-- overview image is last
+            // <|image_end|>
+            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
+            tok_ov_img_start  = lookup_token("<|image|>");
+            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
+            tok_row_end       = lookup_token("<|tile_y_separator|>");
+            tok_row_end_trail = true;  // add trailing end-of-row token
+            ov_img_first      = false; // overview image is last
         }
     }
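
Concretely, with these settings an image split into a 2x2 grid of tiles plus an overview would render as the sequence below (tile numbering is illustrative; the <|image_start|>/<|image_end|> wrapper comes from the marker rewrite in mtmd_tokenize, shown in the next hunk):

    <|image_start|>
    (tile 0) <|tile_x_separator|> (tile 1) <|tile_y_separator|>
    (tile 2) <|tile_x_separator|> (tile 3) <|tile_y_separator|>
    <|image|> (overview)
    <|image_end|>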
@@ -243,16 +267,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
         marker_modified = ctx->image_marker + "[IMG_END]";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
-    }
-    else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
         marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
-    }
-    else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
+    } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
+        // (more details in mtmd_context constructor)
+        marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+    } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
         // <img> ... (image embeddings) ... </img>
         marker_modified = "<img>" + ctx->image_marker + "</img>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
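
A minimal, self-contained illustration of what the llama 4 branch does to the prompt, assuming the marker is mtmd's default "<__image__>" (replace_all below is a stand-in for the project's string_replace_all helper):

#include <iostream>
#include <string>

// stand-in for llama.cpp's string_replace_all
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string prompt = "describe this image: <__image__>";
    // llama 4: wrap the marker so the image embeddings land between the two special tokens
    replace_all(prompt, "<__image__>", "<|image_start|><__image__><|image_end|>");
    std::cout << prompt << "\n";
    // prints: describe this image: <|image_start|><__image__><|image_end|>
}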
@@ -328,7 +354,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         img_u8->ny = bitmaps[i_img]->ny;
         img_u8->buf.resize(bitmaps[i_img]->data.size());
         std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
-        clip_image_size img_u8_size{img_u8->nx, img_u8->ny};

         // preprocess image
         clip_image_f32_batch batch_f32;
@@ -338,28 +363,40 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             return 2;
         }

-        if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
+        // handle llava-uhd style preprocessing
+        if (
+            ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+            || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+            || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+        ) {
             // split batch into chunks of single images
             auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
             GGML_ASSERT(chunks.size() > 0);

-            // add overview image
-            add_text_chunk({ctx->tok_ov_img_start});
-            output->entries.emplace_back(std::move(chunks.front()));
+            auto ov_chunk = std::move(chunks.front());
+            chunks.erase(chunks.begin());
-            add_text_chunk({ctx->tok_ov_img_end});

-            // add slices
+            // add overview image (first)
+            if (ctx->ov_img_first) {
+                if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_ov_img_start});
+                }
+                output->entries.emplace_back(std::move(ov_chunk));
+                if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_ov_img_end});
+                }
+            }

+            // add slices (or tiles)
             if (!chunks.empty()) {
-                clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
-                int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
-                int n_row = (int)chunks.size() / n_col;
-                GGML_ASSERT(n_row * n_col == (int)chunks.size());
+                const int n_col = batch_f32.grid_x;
+                const int n_row = batch_f32.grid_y;
                 if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
                     add_text_chunk({ctx->tok_slices_start});
                 }
                 for (int y = 0; y < n_row; y++) {
                     for (int x = 0; x < n_col; x++) {
+                        const bool is_last_in_row = (x == n_col - 1);
                         if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
                             add_text_chunk({ctx->tok_sli_img_start});
                         }
@@ -367,8 +404,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                         if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
                             add_text_chunk({ctx->tok_sli_img_end});
                         }
+                        if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+                            add_text_chunk({ctx->tok_sli_img_mid});
+                        }
                     }
-                    if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
+                    if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
                         add_text_chunk({ctx->tok_row_end});
                     }
                 }
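
A self-contained sketch of the separator logic above, assuming a 3x2 tile grid and the llama 4 settings (a mid-row separator plus tok_row_end_trail = true):

#include <cstdio>

int main() {
    const int  n_col = 3, n_row = 2;  // assumed tile grid
    const bool row_end_trail = true;  // llama 4: keep the trailing <|tile_y_separator|>
    for (int y = 0; y < n_row; y++) {
        for (int x = 0; x < n_col; x++) {
            const bool is_last_in_row = (x == n_col - 1);
            std::printf("(tile %d) ", y * n_col + x);
            if (!is_last_in_row) {
                std::printf("<|tile_x_separator|> ");  // between two tiles in a row
            }
        }
        if (y != n_row - 1 || row_end_trail) {
            std::printf("<|tile_y_separator|>");       // end of a tile row
        }
        std::printf("\n");
    }
    return 0;
}

With row_end_trail = false (the minicpmv templates), the condition degenerates to the old y != n_row - 1 check and the last row ends without a separator.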
@@ -377,6 +417,17 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 }
             }

+            // add overview image (last)
+            if (!ctx->ov_img_first) {
+                if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_ov_img_start});
+                }
+                output->entries.emplace_back(std::move(ov_chunk));
+                if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_ov_img_end});
+                }
+            }

         } else {
             size_t n_tokens = 0;
             for (const auto & entry : batch_f32.entries) {
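
This block mirrors the overview-first block earlier in the same branch; only one of the two runs, selected by the template:

    ov_img_first = true  (minicpmv): (overview) (slice) (slice) ...
    ov_img_first = false (llama 4):  (tile) (tile) ... <|image|> (overview)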
@@ -427,14 +478,6 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;

-    // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
-    {
-        clip_image_size slice_size{
-            image_tokens->batch_f32.entries[0]->nx,
-            image_tokens->batch_f32.entries[0]->ny};
-        clip_add_load_image_size(ctx->ctx_clip, &slice_size);
-    }

     if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;