From de4c07f93783a1a96456a44dc16b9db538ee1618 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 12 May 2025 15:06:51 +0200 Subject: [PATCH] clip : cap max image size 1024 for qwen vl model (#13478) --- tools/mtmd/clip.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 0adf0316..41ba45a7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1909,16 +1909,20 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_QWEN2VL: { - // max image size = sqrt(max_pixels) - // https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json - hparams.image_size = 3584; + // max image size = sqrt(max_pixels) = 3584 + // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json + // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable + // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 + hparams.image_size = 1024; hparams.warmup_image_size = hparams.patch_size * 8; } break; case PROJECTOR_TYPE_QWEN25VL: { // max image size = sqrt(max_pixels) // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json - hparams.image_size = 3584; + // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable + // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 + hparams.image_size = 1024; hparams.warmup_image_size = hparams.patch_size * 8; get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); } break;