mtmd : add vision support for llama 4 (#13282)
* wip llama 4 conversion * rm redundant __init__ * fix conversion * fix conversion * test impl * try this * reshape patch_embeddings_0 * fix view * rm ffn_post_norm * cgraph ok * f32 for pos embd * add image marker tokens * Llama4UnfoldConvolution * correct pixel shuffle * fix merge conflicts * correct * add debug_graph * logits matched, but it still preceives the image incorrectly * fix style * add image_grid_pinpoints * handle llama 4 preprocessing * rm load_image_size * rm unused line * fix * small fix 2 * add test & docs * fix llava-1.6 test * test: add notion of huge models * add comment * add warn about degraded quality
This commit is contained in:
parent
f71f40a284
commit
92ecdcc06a
9 changed files with 424 additions and 82 deletions
|
@ -47,10 +47,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
|||
// this should be equal to the embedding dimension of the text model
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
|
||||
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
||||
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
|
||||
|
||||
struct clip_image_size * clip_image_size_init(void);
|
||||
struct clip_image_u8 * clip_image_u8_init (void);
|
||||
struct clip_image_f32 * clip_image_f32_init(void);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue