mtmd : add vision support for llama 4 (#13282)

* wip llama 4 conversion

* rm redundant __init__

* fix conversion

* fix conversion

* test impl

* try this

* reshape patch_embeddings_0

* fix view

* rm ffn_post_norm

* cgraph ok

* f32 for pos embd

* add image marker tokens

* Llama4UnfoldConvolution

* correct pixel shuffle

* fix merge conflicts

* correct

* add debug_graph

* logits matched, but it still preceives the image incorrectly

* fix style

* add image_grid_pinpoints

* handle llama 4 preprocessing

* rm load_image_size

* rm unused line

* fix

* small fix 2

* add test & docs

* fix llava-1.6 test

* test: add notion of huge models

* add comment

* add warn about degraded quality
This commit is contained in:
Xuan-Son Nguyen 2025-05-19 13:04:14 +02:00 committed by GitHub
parent f71f40a284
commit 92ecdcc06a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 424 additions and 82 deletions

View file

@ -47,10 +47,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
// this should be equal to the embedding dimension of the text model
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
struct clip_image_size * clip_image_size_init(void);
struct clip_image_u8 * clip_image_u8_init (void);
struct clip_image_f32 * clip_image_f32_init(void);