convert : converting mmproj for Qwen2/2.5VL from convert_hf_to_gguf (#13209)
* wip * qwen2.5vl ok * vision: fix models missing "text_config" * add test * fix test repo name * fix 32B model * Revert "fix 32B model" This reverts commit 651752f1ae25fe8a01c1e57c18cf2eca80b2774e. * clarify about 32B * rm qwen surgery script * update llava/readme * move V_ENC_EMBD_PATCH handling to Qwen2VLVisionModel
This commit is contained in:
parent
c642bc014c
commit
074e42ab31
7 changed files with 132 additions and 233 deletions
|
@ -1089,6 +1089,8 @@ class VisionModel(ModelBase):
|
|||
raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
|
||||
|
||||
# get n_embd of the text model
|
||||
if "text_config" not in self.hparams:
|
||||
self.hparams["text_config"] = {}
|
||||
text_config = {**self.hparams, **self.hparams["text_config"]}
|
||||
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
|
||||
assert self.n_embd_text > 0, "n_embd not found in hparams"
|
||||
|
@ -2583,6 +2585,82 @@ class Qwen2VLModel(TextModel):
|
|||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
class Qwen2VLVisionModel(VisionModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hparams["image_size"] = self.hparams.get("image_size", 560)
|
||||
# rename config.json values
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
|
||||
self.hparams["num_hidden_layers"] = self.hparams.get("depth")
|
||||
if "embed_dim" in self.hparams: # qwen2vl
|
||||
self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
|
||||
self.hparams["hidden_size"] = self.hparams.get("embed_dim")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if self.global_config['model_type'] == 'qwen2_vl':
|
||||
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
|
||||
elif self.global_config['model_type'] == 'qwen2_5_vl':
|
||||
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
# find n_wa_pattern (window attention pattern)
|
||||
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
|
||||
assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
|
||||
n_wa_pattern = fullatt_block_indexes[0] + 1
|
||||
# validate n_wa_pattern
|
||||
for i in range(1, len(fullatt_block_indexes)):
|
||||
if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
|
||||
raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
|
||||
self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
|
||||
else:
|
||||
raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
|
||||
# default values below are taken from HF tranformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
if ".position_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("visual."):
|
||||
# process visual tensors
|
||||
# split QKV tensors if needed
|
||||
if ".qkv." in name:
|
||||
if data_torch.ndim == 2: # weight
|
||||
c3, _ = data_torch.shape
|
||||
else: # bias
|
||||
c3 = data_torch.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = data_torch[:c]
|
||||
wk = data_torch[c: c * 2]
|
||||
wv = data_torch[c * 2:]
|
||||
return [
|
||||
(self.map_tensor_name(name.replace("qkv", "q")), wq),
|
||||
(self.map_tensor_name(name.replace("qkv", "k")), wk),
|
||||
(self.map_tensor_name(name.replace("qkv", "v")), wv),
|
||||
]
|
||||
elif 'patch_embed.proj.weight' in name:
|
||||
# split Conv3D into Conv2Ds
|
||||
c1, c2, kt, kh, kw = data_torch.shape
|
||||
del c1, c2, kh, kw # unused
|
||||
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
|
||||
return [
|
||||
(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]),
|
||||
(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
|
||||
]
|
||||
else:
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("WavTokenizerDec")
|
||||
class WavTokenizerDecModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue