mtmd : support InternVL 2.5 and 3 (#13422)

* convert : internvl support

* InternVL3-1B working

* fix regression

* rm mobilevlm from test

* fix conversion

* add test for internvl

* add to list of pre-quant

* restore boi/eoi check

* add clarifying comment for norm eps
This commit is contained in:
Xuan-Son Nguyen 2025-05-10 16:26:42 +02:00 committed by GitHub
parent d8919424f1
commit 053367d149
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 243 additions and 25 deletions

View file

@ -426,7 +426,11 @@ class ModelBase:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
config = json.load(f)
if "llm_config" in config:
# rename for InternVL
config["text_config"] = config["llm_config"]
return config
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@ -2606,6 +2610,11 @@ class Qwen2Model(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Normalize a checkpoint tensor name and forward it to the parent converter.

    Names are prefixed with ``model.`` when ``self.hf_arch`` is
    ``"Qwen2Model"``, and any ``language_model.`` segment (InternVL) is
    removed. Tensors whose resulting name starts with ``mlp`` or
    ``vision_model`` are treated as visual tensors and produce no output.
    Everything else is delegated to the parent class's ``modify_tensors``.
    """
    if self.hf_arch == "Qwen2Model":
        # map to Qwen2ForCausalLM tensors
        name = f"model.{name}"

    # for InternVL: str.replace leaves the name untouched when the segment
    # is absent, so no membership check is needed first
    name = name.replace("language_model.", "")

    is_visual = name.startswith(("mlp", "vision_model"))
    if is_visual:
        # skip visual tensors
        return []

    yield from super().modify_tensors(data_torch, name, bid)
@ -2709,6 +2718,62 @@ class Qwen2VLVisionModel(VisionModel):
return [] # skip other tensors
@ModelBase.register("InternVisionModel")
class InternVisionModel(VisionModel):
    """Vision-side converter registered for the ``InternVisionModel`` architecture.

    Writes the INTERNVL projector type and related vision metadata, and
    converts tensors whose names start with ``vision_model`` or ``mlp``;
    every other tensor is skipped.
    """

    def set_gguf_parameters(self):
        """Write the base vision parameters plus the InternVL-specific ones.

        Raises:
            ValueError: if ``hidden_act`` is neither ``"silu"`` nor ``"gelu"``.
            AssertionError: if the global config has no ``downsample_ratio``.
        """
        super().set_gguf_parameters()
        hparams = self.hparams
        writer = self.gguf_writer

        writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
        writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])

        # hidden_act
        activation = hparams["hidden_act"]
        if activation == "silu":
            writer.add_vision_use_silu(True)
        elif activation == "gelu":
            writer.add_vision_use_gelu(True)
        else:
            raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")

        # downsample_ratio: stored as the truncated integer reciprocal
        ratio = self.global_config.get("downsample_ratio")
        assert ratio is not None
        writer.add_vision_projector_scale_factor(int(1.0 / ratio))

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Return a forced tensor type for selected tensors, else ``False``.

        Names containing ``.patch_embd.`` are forced to F16 and names
        containing ``.position_embd.`` to F32; the patch check wins when
        both markers are present.
        """
        del bid, name, n_dims  # unused
        forced_types = (
            (".patch_embd.", gguf.GGMLQuantizationType.F16),
            (".position_embd.", gguf.GGMLQuantizationType.F32),
        )
        for marker, forced in forced_types:
            if marker in new_name:
                return forced
        return False

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename one visual tensor, splitting a fused QKV tensor into Q, K and V.

        Returns a list of ``(mapped_name, tensor)`` pairs: three entries for
        a ``.qkv.`` tensor, one entry for any other visual tensor, and an
        empty list for tensors that are not visual.
        """
        del bid  # unused

        if not name.startswith(("vision_model", "mlp")):
            return []  # skip other tensors

        # process visual tensors
        # correct name
        if name.startswith("vision_model"):
            name = "vision_tower." + name
        lacks_weight_suffix = not name.endswith(".weight")
        if lacks_weight_suffix and (".ls" in name or "position_embedding" in name):
            name += ".weight"

        if ".qkv." not in name:
            return [(self.map_tensor_name(name), data_torch)]

        # split QKV tensors: the first dimension holds Q, K and V stacked,
        # for both the 2-D weight and the bias
        fused_rows = data_torch.shape[0]
        assert fused_rows % 3 == 0
        rows = fused_rows // 3
        pieces = (
            data_torch[:rows],
            data_torch[rows: rows * 2],
            data_torch[rows * 2:],
        )
        targets = ("self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj")
        return [
            (self.map_tensor_name(name.replace("attn.qkv", target)), piece)
            for target, piece in zip(targets, pieces)
        ]
@ModelBase.register("WavTokenizerDec")
class WavTokenizerDecModel(TextModel):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@ -3360,6 +3425,11 @@ class InternLM2Model(TextModel):
head_dim = n_embd // num_heads
num_groups = num_heads // q_per_kv
name = name.replace("language_model.", "") # InternVL
if name.startswith("mlp") or name.startswith("vision_model"):
# skip visual tensors
return []
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
qkv = data_torch
@ -3433,6 +3503,10 @@ class InternLM3Model(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
name = name.replace("language_model.", "") # InternVL
if name.startswith("mlp") or name.startswith("vision_model"):
# skip visual tensors
return []
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):