From c104023994d36a8e791fc6a43789b84fd552cefc Mon Sep 17 00:00:00 2001
From: City <125218114+city96@users.noreply.github.com>
Date: Mon, 12 May 2025 00:39:06 +0200
Subject: [PATCH] mtmd : Use RMS norm for InternVL 3 38B and 78B mmproj (#13459)

---
 tools/mtmd/clip.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3f11c301..0adf0316 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -879,9 +879,15 @@ struct clip_graph {
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
+        // The larger models use a different ViT, which uses RMS norm instead of layer norm
+        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
+        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
+            ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
+            : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+
         ggml_tensor * cur = build_vit(
                                 inp, n_pos,
-                                NORM_TYPE_NORMAL,
+                                norm_t,
                                 hparams.ffn_op,
                                 model.position_embeddings,
                                 nullptr);