mtmd : support SmolVLM (version 1 and 2) (#13050)

* mtmd : support SmolVLM (version 1 and 2)
* correct chat template
* fix n_patches
* scale_factor is an int
* add more models to test
2025-04-22 16:24:54 +02:00 · 2025-04-22 16:24:54 +02:00 · dc39a5e7a8
commit dc39a5e7a8
parent ab47dec3d3
10 changed files with 279 additions and 65 deletions
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -231,11 +231,15 @@ class Keys:
        IMAGE_MEAN          = "clip.vision.image_mean"
        IMAGE_STD           = "clip.vision.image_std"
        USE_GELU            = "clip.use_gelu"
+        USE_SILU            = "clip.use_silu"

        class Attention:
            HEAD_COUNT      = "clip.vision.attention.head_count"
            LAYERNORM_EPS   = "clip.vision.attention.layer_norm_epsilon"

+        class Projector:
+            SCALE_FACTOR    = "clip.vision.projector.scale_factor"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -2122,6 +2126,11 @@ class GGUFValueType(IntEnum):
            raise ValueError(f"Unknown type: {type(val)}")


+class VisionProjectorType:
+    GEMMA3 = "gemma3"
+    IDEFICS3 = "idefics3"
+
+
 # Items here are (block size, type size)
 QK_K = 256
 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {