mtmd : add vision support for llama 4 (#13282)

* wip llama 4 conversion

* rm redundant __init__

* fix conversion

* fix conversion

* test impl

* try this

* reshape patch_embeddings_0

* fix view

* rm ffn_post_norm

* cgraph ok

* f32 for pos embd

* add image marker tokens

* Llama4UnfoldConvolution

* correct pixel shuffle

* fix merge conflicts

* correct

* add debug_graph

* logits matched, but it still perceives the image incorrectly

* fix style

* add image_grid_pinpoints

* handle llama 4 preprocessing

* rm load_image_size

* rm unused line

* fix

* small fix 2

* add test & docs

* fix llava-1.6 test

* test: add notion of huge models

* add comment

* add warn about degraded quality
This commit is contained in:
Xuan-Son Nguyen 2025-05-19 13:04:14 +02:00 committed by GitHub
parent f71f40a284
commit 92ecdcc06a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 424 additions and 82 deletions

View file

@ -482,14 +482,15 @@ class MODEL_TENSOR(IntEnum):
V_ENC_EMBD_CLS = auto()
V_ENC_EMBD_PATCH = auto()
V_ENC_EMBD_POS = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_ATTN_Q = auto()
V_ENC_ATTN_Q_NORM = auto()
V_ENC_ATTN_K = auto()
V_ENC_ATTN_K_NORM = auto()
V_ENC_ATTN_V = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_OUTPUT = auto()
V_ENC_OUTPUT_NORM = auto()
V_ENC_ATTN_O = auto()
V_ENC_ATTN_O_NORM = auto()
V_ENC_POST_ATTN_NORM = auto()
V_ENC_FFN_UP = auto()
V_ENC_FFN_GATE = auto()
V_ENC_FFN_DOWN = auto()
@ -749,8 +750,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2",
MODEL_TENSOR.V_ENC_ATTN_O: "v.blk.{bid}.attn_out",
MODEL_TENSOR.V_ENC_ATTN_O_NORM: "v.blk.{bid}.attn_out_norm",
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2",
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
@ -785,14 +787,15 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_K_NORM,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_O,
MODEL_TENSOR.V_ENC_ATTN_O_NORM,
MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_GATE,
MODEL_TENSOR.V_ENC_FFN_DOWN,
@ -2180,6 +2183,7 @@ class VisionProjectorType:
GEMMA3 = "gemma3"
IDEFICS3 = "idefics3"
PIXTRAL = "pixtral"
LLAMA4 = "llama4"
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
INTERNVL = "internvl"