mtmd : add support for Qwen2-Audio and SeaLLM-Audio (#13760)

* mtmd : add Qwen2-Audio support

* small clean up

* update discussion link

* clarify mtmd_get_output_embd

* clarification in multimodal.md

* fix ultravox bug

* ggml_cont
This commit is contained in:
Xuan-Son Nguyen 2025-05-25 14:06:32 +02:00 committed by GitHub
parent a08c1d2845
commit 40aaa8a403
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 144 additions and 52 deletions

View file

@@ -146,6 +146,13 @@ struct mtmd_context {
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
}
if (llama_model_n_embd(text_model) != clip_n_mmproj_embd(ctx_clip)) {
throw std::runtime_error(string_format(
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
"hint: you may be using wrong mmproj\n",
llama_model_n_embd(text_model), clip_n_mmproj_embd(ctx_clip)));
}
has_vision = clip_has_vision_encoder(ctx_clip);
has_audio = clip_has_audio_encoder(ctx_clip);
use_mrope = clip_is_qwen2vl(ctx_clip);
@@ -196,7 +203,7 @@ struct mtmd_context {
ov_img_first = false; // overview image is last
}
- if (proj == PROJECTOR_TYPE_ULTRAVOX) {
+ if (clip_has_whisper_encoder(ctx_clip)) {
// TODO @ngxson : check if model n_mel is 128 or 80
w_filters = whisper_precalc_filters::get_128_bins();
}
@@ -208,7 +215,7 @@ struct mtmd_context {
}
if (has_audio) {
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
- " https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
+ " https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
}
}
@@ -327,6 +334,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
marker_modified = "<img>" + ctx->media_marker + "</img>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
marker_modified = "<|audio_bos|>" + ctx->media_marker + "<|audio_eos|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix