mtmd : support Qwen 2.5 Omni (input audio+vision, no audio output) (#13784)
* mtmd : allow multiple modalities at the same time
* refactor mtmd tokenizer
* fix compile
* ok, missing SinusoidsPositionEmbedding
* first working version
* fix style
* more strict validate of n_embd
* refactor if..else to switch
* fix regression
* add test for 3B
* update docs
* fix tokenizing with add_special
* add more tests
* fix test case "huge"
* rm redundant code
* set_position_mrope_1d rm n_tokens
This commit is contained in:
parent
72b090da2c
commit
bc583e3c63
12 changed files with 1148 additions and 744 deletions
|
@@ -17,12 +17,22 @@ struct clip_image_f32;
|
|||
struct clip_image_u8_batch;
|
||||
struct clip_image_f32_batch;
|
||||
|
||||
// Names the two input modalities this header distinguishes: vision and audio.
// NOTE(review): where this enum is consumed is not visible in this hunk.
enum clip_modality {
|
||||
CLIP_MODALITY_VISION,
|
||||
CLIP_MODALITY_AUDIO,
|
||||
};
|
||||
|
||||
// Options passed by value to clip_init().
struct clip_context_params {
|
||||
bool use_gpu; // presumably requests a GPU backend when true — confirm against the clip_init implementation
|
||||
enum ggml_log_level verbosity; // a ggml log level; how it filters output is not shown in this header
|
||||
};
|
||||
|
||||
// NOTE(review): the hunk header counts (12 old lines, 22 new) imply this is the
// removed pre-change declaration; the clip_init declaration returning
// struct clip_init_result further down in this hunk is the one that replaces it.
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
// Value returned by clip_init(): one context pointer per modality.
// NOTE(review): whether a pointer may be NULL when the loaded file lacks that
// modality is not shown in this header — confirm before dereferencing.
struct clip_init_result {
|
||||
struct clip_ctx * ctx_v; // vision context
|
||||
struct clip_ctx * ctx_a; // audio context
|
||||
};
|
||||
|
||||
// Takes `fname` and `ctx_params` (by value) and returns a clip_init_result
// holding the vision and audio context pointers.
// NOTE(review): presumably `fname` is the path of the model file to load and
// each returned context is released with clip_free() — confirm in the implementation.
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
|
||||
// Takes a single clip_ctx pointer and returns nothing.
// NOTE(review): presumably releases the context; since clip_init() returns two
// contexts, each would need its own call — confirm NULL handling in the implementation.
void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue