server : support audio input (#13714)
* server : support audio input * add audio support on webui
This commit is contained in:
parent
faaaff5f94
commit
9ecf3e66a3
12 changed files with 276 additions and 173 deletions
|
@ -12,17 +12,7 @@ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
|
||||||
size_t n_tokens = 0;
|
size_t n_tokens = 0;
|
||||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
|
||||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
||||||
size_t n_tokens_text;
|
|
||||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
|
||||||
n_tokens += n_tokens_text;
|
|
||||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
||||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
|
||||||
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false && "chunk type not supported");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return n_tokens;
|
return n_tokens;
|
||||||
}
|
}
|
||||||
|
@ -31,17 +21,7 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
|
||||||
llama_pos n_pos = 0;
|
llama_pos n_pos = 0;
|
||||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
n_pos += mtmd_input_chunk_get_n_pos(chunk);
|
||||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
||||||
size_t n_tokens_text;
|
|
||||||
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
|
|
||||||
n_pos += n_tokens_text;
|
|
||||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
||||||
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
|
|
||||||
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false && "chunk type not supported");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return n_pos;
|
return n_pos;
|
||||||
}
|
}
|
||||||
|
|
|
@ -751,6 +751,10 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
|
||||||
return bitmap->data.data();
|
return bitmap->data.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
|
||||||
|
return bitmap->data.size();
|
||||||
|
}
|
||||||
|
|
||||||
bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
|
bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
|
||||||
return bitmap->is_audio;
|
return bitmap->is_audio;
|
||||||
}
|
}
|
||||||
|
|
|
@ -119,11 +119,12 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
||||||
// the data is in float format (PCM F32)
|
// the data is in float format (PCM F32)
|
||||||
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
|
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
|
||||||
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
|
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
|
||||||
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
|
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
|
||||||
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
|
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
|
||||||
MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap);
|
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
|
||||||
MTMD_API bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap);
|
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
|
||||||
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
|
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
|
||||||
|
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
|
||||||
// bitmap ID is optional, but useful for KV cache tracking
|
// bitmap ID is optional, but useful for KV cache tracking
|
||||||
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
|
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
|
||||||
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
|
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
|
||||||
|
@ -322,6 +323,7 @@ struct bitmap {
|
||||||
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
|
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
|
||||||
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
|
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
|
||||||
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
|
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
|
||||||
|
size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
|
||||||
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
|
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
|
||||||
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
|
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
|
||||||
};
|
};
|
||||||
|
|
Binary file not shown.
|
@ -1891,6 +1891,7 @@ struct server_context {
|
||||||
float slot_prompt_similarity = 0.0f;
|
float slot_prompt_similarity = 0.0f;
|
||||||
|
|
||||||
common_chat_templates_ptr chat_templates;
|
common_chat_templates_ptr chat_templates;
|
||||||
|
oaicompat_parser_options oai_parser_opt;
|
||||||
|
|
||||||
~server_context() {
|
~server_context() {
|
||||||
mtmd_free(mctx);
|
mtmd_free(mctx);
|
||||||
|
@ -2086,6 +2087,15 @@ struct server_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
metrics.init();
|
metrics.init();
|
||||||
|
|
||||||
|
oai_parser_opt = {
|
||||||
|
/* use_jinja */ params_base.use_jinja,
|
||||||
|
/* prefill_assistant */ params_base.prefill_assistant,
|
||||||
|
/* reasoning_format */ params_base.reasoning_format,
|
||||||
|
/* common_chat_templates */ chat_templates.get(),
|
||||||
|
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||||
|
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
server_slot * get_slot_by_id(int id) {
|
server_slot * get_slot_by_id(int id) {
|
||||||
|
@ -4092,7 +4102,10 @@ int main(int argc, char ** argv) {
|
||||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||||
{ "model_path", ctx_server.params_base.model.path },
|
{ "model_path", ctx_server.params_base.model.path },
|
||||||
{ "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
|
{ "modalities", json{
|
||||||
|
{"vision", ctx_server.oai_parser_opt.allow_image},
|
||||||
|
{"audio", ctx_server.oai_parser_opt.allow_audio},
|
||||||
|
} },
|
||||||
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
||||||
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
||||||
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
||||||
|
@ -4183,10 +4196,10 @@ int main(int argc, char ** argv) {
|
||||||
for (auto & file : files) {
|
for (auto & file : files) {
|
||||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
|
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
|
||||||
if (!bmp.ptr) {
|
if (!bmp.ptr) {
|
||||||
throw std::runtime_error("Failed to load image");
|
throw std::runtime_error("Failed to load image or audio file");
|
||||||
}
|
}
|
||||||
// calculate bitmap hash (for KV caching)
|
// calculate bitmap hash (for KV caching)
|
||||||
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
|
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
|
||||||
bmp.set_id(hash.c_str());
|
bmp.set_id(hash.c_str());
|
||||||
bitmaps.entries.push_back(std::move(bmp));
|
bitmaps.entries.push_back(std::move(bmp));
|
||||||
}
|
}
|
||||||
|
@ -4418,7 +4431,7 @@ int main(int argc, char ** argv) {
|
||||||
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
|
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
|
||||||
LOG_DBG("request: %s\n", req.body.c_str());
|
LOG_DBG("request: %s\n", req.body.c_str());
|
||||||
if (ctx_server.params_base.embedding) {
|
if (ctx_server.params_base.embedding) {
|
||||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||||
|
@ -4427,13 +4440,9 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
auto body = json::parse(req.body);
|
auto body = json::parse(req.body);
|
||||||
std::vector<raw_buffer> files;
|
std::vector<raw_buffer> files;
|
||||||
json data = oaicompat_completion_params_parse(
|
json data = oaicompat_chat_params_parse(
|
||||||
body,
|
body,
|
||||||
params.use_jinja,
|
ctx_server.oai_parser_opt,
|
||||||
params.prefill_assistant,
|
|
||||||
params.reasoning_format,
|
|
||||||
ctx_server.chat_templates.get(),
|
|
||||||
ctx_server.mctx,
|
|
||||||
files);
|
files);
|
||||||
|
|
||||||
handle_completions_impl(
|
handle_completions_impl(
|
||||||
|
@ -4446,16 +4455,12 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
|
|
||||||
// same with handle_chat_completions, but without inference part
|
// same with handle_chat_completions, but without inference part
|
||||||
const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_apply_template = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||||
auto body = json::parse(req.body);
|
auto body = json::parse(req.body);
|
||||||
std::vector<raw_buffer> files; // dummy, unused
|
std::vector<raw_buffer> files; // dummy, unused
|
||||||
json data = oaicompat_completion_params_parse(
|
json data = oaicompat_chat_params_parse(
|
||||||
body,
|
body,
|
||||||
params.use_jinja,
|
ctx_server.oai_parser_opt,
|
||||||
params.prefill_assistant,
|
|
||||||
params.reasoning_format,
|
|
||||||
ctx_server.chat_templates.get(),
|
|
||||||
ctx_server.mctx,
|
|
||||||
files);
|
files);
|
||||||
res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
|
res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
|
||||||
};
|
};
|
||||||
|
|
|
@ -30,6 +30,7 @@ def create_server():
|
||||||
("What is this:\n", "malformed", False, None),
|
("What is this:\n", "malformed", False, None),
|
||||||
("What is this:\n", "https://google.com/404", False, None), # non-existent image
|
("What is this:\n", "https://google.com/404", False, None), # non-existent image
|
||||||
("What is this:\n", "https://ggml.ai", False, None), # non-image data
|
("What is this:\n", "https://ggml.ai", False, None), # non-image data
|
||||||
|
# TODO @ngxson : test with multiple images, no images and with audio
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_vision_chat_completion(prompt, image_url, success, re_content):
|
def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||||
|
|
|
@ -536,6 +536,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
|
||||||
// OAI utils
|
// OAI utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// used by /completions endpoint
|
||||||
static json oaicompat_completion_params_parse(const json & body) {
|
static json oaicompat_completion_params_parse(const json & body) {
|
||||||
json llama_params;
|
json llama_params;
|
||||||
|
|
||||||
|
@ -580,13 +581,19 @@ static json oaicompat_completion_params_parse(const json & body) {
|
||||||
return llama_params;
|
return llama_params;
|
||||||
}
|
}
|
||||||
|
|
||||||
static json oaicompat_completion_params_parse(
|
struct oaicompat_parser_options {
|
||||||
|
bool use_jinja;
|
||||||
|
bool prefill_assistant;
|
||||||
|
common_reasoning_format reasoning_format;
|
||||||
|
common_chat_templates * tmpls;
|
||||||
|
bool allow_image;
|
||||||
|
bool allow_audio;
|
||||||
|
};
|
||||||
|
|
||||||
|
// used by /chat/completions endpoint
|
||||||
|
static json oaicompat_chat_params_parse(
|
||||||
const json & body, /* openai api json semantics */
|
const json & body, /* openai api json semantics */
|
||||||
bool use_jinja,
|
const oaicompat_parser_options & opt,
|
||||||
bool prefill_assistant,
|
|
||||||
common_reasoning_format reasoning_format,
|
|
||||||
const struct common_chat_templates * tmpls,
|
|
||||||
bool allow_non_text,
|
|
||||||
std::vector<raw_buffer> & out_files)
|
std::vector<raw_buffer> & out_files)
|
||||||
{
|
{
|
||||||
json llama_params;
|
json llama_params;
|
||||||
|
@ -598,11 +605,11 @@ static json oaicompat_completion_params_parse(
|
||||||
if (stream) {
|
if (stream) {
|
||||||
throw std::runtime_error("Cannot use tools with stream");
|
throw std::runtime_error("Cannot use tools with stream");
|
||||||
}
|
}
|
||||||
if (!use_jinja) {
|
if (!opt.use_jinja) {
|
||||||
throw std::runtime_error("tools param requires --jinja flag");
|
throw std::runtime_error("tools param requires --jinja flag");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!use_jinja) {
|
if (!opt.use_jinja) {
|
||||||
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
|
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
|
||||||
throw std::runtime_error("Unsupported param: tool_choice");
|
throw std::runtime_error("Unsupported param: tool_choice");
|
||||||
}
|
}
|
||||||
|
@ -667,12 +674,12 @@ static json oaicompat_completion_params_parse(
|
||||||
|
|
||||||
for (auto & p : content) {
|
for (auto & p : content) {
|
||||||
std::string type = json_value(p, "type", std::string());
|
std::string type = json_value(p, "type", std::string());
|
||||||
json image_url = json_value(p, "image_url", json::object());
|
|
||||||
if (type == "image_url") {
|
if (type == "image_url") {
|
||||||
if (!allow_non_text) {
|
if (!opt.allow_image) {
|
||||||
throw std::runtime_error("image input is not supported by this server");
|
throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
json image_url = json_value(p, "image_url", json::object());
|
||||||
std::string url = json_value(image_url, "url", std::string());
|
std::string url = json_value(image_url, "url", std::string());
|
||||||
if (string_starts_with(url, "http")) {
|
if (string_starts_with(url, "http")) {
|
||||||
// download remote image
|
// download remote image
|
||||||
|
@ -712,6 +719,29 @@ static json oaicompat_completion_params_parse(
|
||||||
p["type"] = "text";
|
p["type"] = "text";
|
||||||
p["text"] = mtmd_default_marker();
|
p["text"] = mtmd_default_marker();
|
||||||
p.erase("image_url");
|
p.erase("image_url");
|
||||||
|
|
||||||
|
} else if (type == "input_audio") {
|
||||||
|
if (!opt.allow_audio) {
|
||||||
|
throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
|
||||||
|
}
|
||||||
|
|
||||||
|
json input_audio = json_value(p, "input_audio", json::object());
|
||||||
|
std::string data = json_value(input_audio, "data", std::string());
|
||||||
|
std::string format = json_value(input_audio, "format", std::string());
|
||||||
|
// while we also support flac, we don't allow it here so we matches the OAI spec
|
||||||
|
if (format != "wav" && format != "mp3") {
|
||||||
|
throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
|
||||||
|
}
|
||||||
|
auto decoded_data = base64_decode(data); // expected to be base64 encoded
|
||||||
|
out_files.push_back(decoded_data);
|
||||||
|
|
||||||
|
// replace this chunk with a marker
|
||||||
|
p["type"] = "text";
|
||||||
|
p["text"] = mtmd_default_marker();
|
||||||
|
p.erase("input_audio");
|
||||||
|
|
||||||
|
} else if (type != "text") {
|
||||||
|
throw std::runtime_error("unsupported content[].type");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -723,9 +753,9 @@ static json oaicompat_completion_params_parse(
|
||||||
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
|
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
|
||||||
inputs.grammar = grammar;
|
inputs.grammar = grammar;
|
||||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||||
inputs.use_jinja = use_jinja;
|
inputs.use_jinja = opt.use_jinja;
|
||||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
||||||
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
inputs.extract_reasoning = opt.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||||
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
|
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
|
||||||
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
||||||
|
@ -733,7 +763,7 @@ static json oaicompat_completion_params_parse(
|
||||||
|
|
||||||
// if the assistant message appears at the end of list, we do not add end-of-turn token
|
// if the assistant message appears at the end of list, we do not add end-of-turn token
|
||||||
// for ex. this can be useful to modify the reasoning process in reasoning models
|
// for ex. this can be useful to modify the reasoning process in reasoning models
|
||||||
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
|
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
|
||||||
common_chat_msg last_message;
|
common_chat_msg last_message;
|
||||||
if (prefill_assistant_message) {
|
if (prefill_assistant_message) {
|
||||||
last_message = inputs.messages.back();
|
last_message = inputs.messages.back();
|
||||||
|
@ -749,7 +779,7 @@ static json oaicompat_completion_params_parse(
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply chat template to the list of messages
|
// Apply chat template to the list of messages
|
||||||
auto chat_params = common_chat_templates_apply(tmpls, inputs);
|
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
|
||||||
|
|
||||||
/* Append assistant prefilled message */
|
/* Append assistant prefilled message */
|
||||||
if (prefill_assistant_message) {
|
if (prefill_assistant_message) {
|
||||||
|
@ -1040,7 +1070,7 @@ struct server_tokens {
|
||||||
private: // disallow accessing these members directly, risking out-of-sync
|
private: // disallow accessing these members directly, risking out-of-sync
|
||||||
|
|
||||||
// map a **start** position in tokens to the image chunk
|
// map a **start** position in tokens to the image chunk
|
||||||
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
|
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
|
||||||
|
|
||||||
// list of tokens
|
// list of tokens
|
||||||
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
|
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
|
||||||
|
@ -1051,7 +1081,7 @@ private: // disallow accessing these members directly, risking out-of-sync
|
||||||
// for ex. with input of 5 text tokens and 2 images:
|
// for ex. with input of 5 text tokens and 2 images:
|
||||||
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
||||||
// pos 0 1 2 3 4 5 6 7 8 9
|
// pos 0 1 2 3 4 5 6 7 8 9
|
||||||
// map_pos_to_image will contain: {5, img0}, {8, img1}
|
// map_pos_to_media will contain: {5, img0}, {8, img1}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
server_tokens() = default;
|
server_tokens() = default;
|
||||||
|
@ -1090,15 +1120,15 @@ public:
|
||||||
}
|
}
|
||||||
oss << "\n";
|
oss << "\n";
|
||||||
oss << "image pos: ";
|
oss << "image pos: ";
|
||||||
for (const auto & it : map_pos_to_image) {
|
for (const auto & it : map_pos_to_media) {
|
||||||
oss << it.first << ", ";
|
oss << it.first << ", ";
|
||||||
}
|
}
|
||||||
return oss.str();
|
return oss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
|
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
|
||||||
auto it = map_pos_to_image.find(pos);
|
auto it = map_pos_to_media.find(pos);
|
||||||
if (it != map_pos_to_image.end()) {
|
if (it != map_pos_to_media.end()) {
|
||||||
return it->second;
|
return it->second;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error("Chunk not found");
|
throw std::runtime_error("Chunk not found");
|
||||||
|
@ -1115,16 +1145,15 @@ public:
|
||||||
// will create a copy of the chunk if it contains non-text data
|
// will create a copy of the chunk if it contains non-text data
|
||||||
void push_back(const mtmd_input_chunk * chunk) {
|
void push_back(const mtmd_input_chunk * chunk) {
|
||||||
auto type = mtmd_input_chunk_get_type(chunk);
|
auto type = mtmd_input_chunk_get_type(chunk);
|
||||||
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||||
GGML_ASSERT(has_mtmd);
|
GGML_ASSERT(has_mtmd);
|
||||||
auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
|
||||||
const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
|
|
||||||
llama_pos start_pos = tokens.size();
|
llama_pos start_pos = tokens.size();
|
||||||
for (int i = 0; i < n_pos; ++i) {
|
for (int i = 0; i < n_pos; ++i) {
|
||||||
tokens.emplace_back(LLAMA_TOKEN_NULL);
|
tokens.emplace_back(LLAMA_TOKEN_NULL);
|
||||||
}
|
}
|
||||||
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
|
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
|
||||||
map_pos_to_image[start_pos] = std::move(new_chunk);
|
map_pos_to_media[start_pos] = std::move(new_chunk);
|
||||||
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||||
size_t n_tokens;
|
size_t n_tokens;
|
||||||
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
||||||
|
@ -1169,6 +1198,9 @@ public:
|
||||||
void keep_first(size_t n) {
|
void keep_first(size_t n) {
|
||||||
GGML_ASSERT(n <= tokens.size());
|
GGML_ASSERT(n <= tokens.size());
|
||||||
if (has_mtmd) {
|
if (has_mtmd) {
|
||||||
|
if (n == tokens.size()) {
|
||||||
|
return; // nothing to do
|
||||||
|
}
|
||||||
// we throw an error if we try to remove a token in the middle of an image
|
// we throw an error if we try to remove a token in the middle of an image
|
||||||
// for ex. with input of 5 text tokens and 2 images:
|
// for ex. with input of 5 text tokens and 2 images:
|
||||||
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
||||||
|
@ -1183,10 +1215,10 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// remove all image chunks that are not used anymore
|
// remove all image chunks that are not used anymore
|
||||||
for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
|
for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
|
||||||
llama_pos pos = it->first;
|
llama_pos pos = it->first;
|
||||||
if (pos >= (llama_pos)n) {
|
if (pos >= (llama_pos)n) {
|
||||||
it = map_pos_to_image.erase(it);
|
it = map_pos_to_media.erase(it);
|
||||||
} else {
|
} else {
|
||||||
++it;
|
++it;
|
||||||
}
|
}
|
||||||
|
@ -1217,14 +1249,12 @@ public:
|
||||||
const auto & a_chunk = find_chunk(i);
|
const auto & a_chunk = find_chunk(i);
|
||||||
const auto & b_chunk = b.find_chunk(i);
|
const auto & b_chunk = b.find_chunk(i);
|
||||||
GGML_ASSERT(a_chunk && b_chunk);
|
GGML_ASSERT(a_chunk && b_chunk);
|
||||||
const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
|
std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
|
||||||
const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
|
std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
|
||||||
std::string ai_id = mtmd_image_tokens_get_id(a_img);
|
size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
|
||||||
std::string bi_id = mtmd_image_tokens_get_id(b_img);
|
size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
|
||||||
size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
|
|
||||||
size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
|
|
||||||
if (ai_id == bi_id && a_pos == b_pos) {
|
if (ai_id == bi_id && a_pos == b_pos) {
|
||||||
GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
|
GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
|
||||||
i += a_pos - 1; // will be +1 by the for loop
|
i += a_pos - 1; // will be +1 by the for loop
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1250,8 +1280,7 @@ public:
|
||||||
if (t == LLAMA_TOKEN_NULL) {
|
if (t == LLAMA_TOKEN_NULL) {
|
||||||
try {
|
try {
|
||||||
const auto & chunk = find_chunk(i);
|
const auto & chunk = find_chunk(i);
|
||||||
const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
|
size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
|
||||||
size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
|
|
||||||
i += n_pos - 1; // will be +1 by the for loop
|
i += n_pos - 1; // will be +1 by the for loop
|
||||||
} catch (const std::exception & e) {
|
} catch (const std::exception & e) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -1270,22 +1299,21 @@ public:
|
||||||
llama_pos n_past,
|
llama_pos n_past,
|
||||||
int32_t seq_id,
|
int32_t seq_id,
|
||||||
llama_pos & n_pos_out) {
|
llama_pos & n_pos_out) {
|
||||||
auto it = map_pos_to_image.find(n_past);
|
auto & chunk = find_chunk(n_past);
|
||||||
if (it == map_pos_to_image.end()) {
|
const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
|
||||||
throw std::runtime_error("Chunk not found");
|
? "image" : "audio";
|
||||||
}
|
SRV_INF("processing %s...\n", name);
|
||||||
SRV_INF("%s\n", "processing image...");
|
|
||||||
int32_t n_batch = llama_n_batch(ctx);
|
int32_t n_batch = llama_n_batch(ctx);
|
||||||
int64_t t0 = ggml_time_ms();
|
int64_t t0 = ggml_time_ms();
|
||||||
llama_pos new_n_past = n_past;
|
llama_pos new_n_past = n_past;
|
||||||
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
|
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
|
||||||
it->second.get(), // chunk
|
chunk.get(),
|
||||||
n_past,
|
n_past,
|
||||||
seq_id,
|
seq_id,
|
||||||
n_batch,
|
n_batch,
|
||||||
true, // logits last
|
true, // logits last
|
||||||
&new_n_past);
|
&new_n_past);
|
||||||
SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
LOG_ERR("mtmd_helper_eval failed with status %d", result);
|
LOG_ERR("mtmd_helper_eval failed with status %d", result);
|
||||||
n_pos_out = n_past;
|
n_pos_out = n_past;
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline';
|
import {
|
||||||
|
DocumentTextIcon,
|
||||||
|
SpeakerWaveIcon,
|
||||||
|
XMarkIcon,
|
||||||
|
} from '@heroicons/react/24/outline';
|
||||||
import { MessageExtra } from '../utils/types';
|
import { MessageExtra } from '../utils/types';
|
||||||
import { useState } from 'react';
|
import { useState } from 'react';
|
||||||
import { classNames } from '../utils/misc';
|
import { classNames } from '../utils/misc';
|
||||||
|
@ -66,7 +70,11 @@ export default function ChatInputExtraContextItem({
|
||||||
className="w-14 h-14 flex items-center justify-center"
|
className="w-14 h-14 flex items-center justify-center"
|
||||||
aria-description="Document icon"
|
aria-description="Document icon"
|
||||||
>
|
>
|
||||||
<DocumentTextIcon className="h-8 w-14 text-base-content/50" />
|
{item.type === 'audioFile' ? (
|
||||||
|
<SpeakerWaveIcon className="h-8 w-8 text-gray-500" />
|
||||||
|
) : (
|
||||||
|
<DocumentTextIcon className="h-8 w-8 text-gray-500" />
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="text-xs pr-4">
|
<div className="text-xs pr-4">
|
||||||
|
@ -98,6 +106,19 @@ export default function ChatInputExtraContextItem({
|
||||||
src={showingItem.base64Url}
|
src={showingItem.base64Url}
|
||||||
alt={`Preview image for ${showingItem.name}`}
|
alt={`Preview image for ${showingItem.name}`}
|
||||||
/>
|
/>
|
||||||
|
) : showingItem.type === 'audioFile' ? (
|
||||||
|
<audio
|
||||||
|
controls
|
||||||
|
className="w-full"
|
||||||
|
aria-description={`Audio file ${showingItem.name}`}
|
||||||
|
>
|
||||||
|
<source
|
||||||
|
src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`}
|
||||||
|
type={showingItem.mimeType}
|
||||||
|
aria-description={`Audio file ${showingItem.name}`}
|
||||||
|
/>
|
||||||
|
Your browser does not support the audio element.
|
||||||
|
</audio>
|
||||||
) : (
|
) : (
|
||||||
<div className="overflow-x-auto">
|
<div className="overflow-x-auto">
|
||||||
<pre className="whitespace-pre-wrap break-words text-sm">
|
<pre className="whitespace-pre-wrap break-words text-sm">
|
||||||
|
|
|
@ -278,6 +278,13 @@ export default function ChatScreen() {
|
||||||
|
|
||||||
function ServerInfo() {
|
function ServerInfo() {
|
||||||
const { serverProps } = useAppContext();
|
const { serverProps } = useAppContext();
|
||||||
|
const modalities = [];
|
||||||
|
if (serverProps?.modalities?.audio) {
|
||||||
|
modalities.push('audio');
|
||||||
|
}
|
||||||
|
if (serverProps?.modalities?.vision) {
|
||||||
|
modalities.push('vision');
|
||||||
|
}
|
||||||
return (
|
return (
|
||||||
<div
|
<div
|
||||||
className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
|
className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
|
||||||
|
@ -291,6 +298,13 @@ function ServerInfo() {
|
||||||
<br />
|
<br />
|
||||||
<b>Build</b>: {serverProps?.build_info}
|
<b>Build</b>: {serverProps?.build_info}
|
||||||
<br />
|
<br />
|
||||||
|
{modalities.length > 0 ? (
|
||||||
|
<>
|
||||||
|
<b>Supported modalities:</b> {modalities.join(', ')}
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
''
|
||||||
|
)}
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
@ -11,6 +11,7 @@ pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
|
||||||
// This file handles uploading extra context items (a.k.a files)
|
// This file handles uploading extra context items (a.k.a files)
|
||||||
// It allows processing these kinds of files:
|
// It allows processing these kinds of files:
|
||||||
// - image files (converted to base64)
|
// - image files (converted to base64)
|
||||||
|
// - audio files (converted to base64)
|
||||||
// - text files (including code files)
|
// - text files (including code files)
|
||||||
// - pdf (converted to text)
|
// - pdf (converted to text)
|
||||||
|
|
||||||
|
@ -41,96 +42,73 @@ export function useChatExtraContext(): ChatExtraContextApi {
|
||||||
|
|
||||||
const isSupportVision = serverProps?.modalities?.vision;
|
const isSupportVision = serverProps?.modalities?.vision;
|
||||||
|
|
||||||
const onFileAdded = (files: File[]) => {
|
const onFileAdded = async (files: File[]) => {
|
||||||
for (const file of files) {
|
try {
|
||||||
const mimeType = file.type;
|
for (const file of files) {
|
||||||
console.debug({ mimeType, file });
|
const mimeType = file.type;
|
||||||
if (file.size > 10 * 1024 * 1024) {
|
if (file.size > 10 * 1024 * 1024) {
|
||||||
toast.error('File is too large. Maximum size is 10MB.');
|
toast.error('File is too large. Maximum size is 10MB.');
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mimeType.startsWith('image/')) {
|
|
||||||
if (!isSupportVision) {
|
|
||||||
toast.error('Multimodal is not supported by this server or model.');
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
const reader = new FileReader();
|
|
||||||
reader.onload = async (event) => {
|
|
||||||
if (event.target?.result) {
|
|
||||||
let base64Url = event.target.result as string;
|
|
||||||
|
|
||||||
if (mimeType === 'image/svg+xml') {
|
if (mimeType.startsWith('image/')) {
|
||||||
// Convert SVG to PNG
|
if (!isSupportVision) {
|
||||||
base64Url = await svgBase64UrlToPngDataURL(base64Url);
|
toast.error('Multimodal is not supported by this server or model.');
|
||||||
}
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
addItems([
|
let base64Url = await getFileAsBase64(file);
|
||||||
{
|
if (mimeType === 'image/svg+xml') {
|
||||||
|
// Convert SVG to PNG
|
||||||
|
base64Url = await svgBase64UrlToPngDataURL(base64Url);
|
||||||
|
}
|
||||||
|
addItems([
|
||||||
|
{
|
||||||
|
type: 'imageFile',
|
||||||
|
name: file.name,
|
||||||
|
base64Url,
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
} else if (mimeType.startsWith('video/')) {
|
||||||
|
toast.error('Video files are not supported yet.');
|
||||||
|
break;
|
||||||
|
} else if (mimeType.startsWith('audio/')) {
|
||||||
|
if (!/mpeg|wav/.test(mimeType)) {
|
||||||
|
toast.error('Only mp3 and wav audio files are supported.');
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// plain base64, not a data URL
|
||||||
|
const base64Data = await getFileAsBase64(file, false);
|
||||||
|
addItems([
|
||||||
|
{
|
||||||
|
type: 'audioFile',
|
||||||
|
name: file.name,
|
||||||
|
mimeType,
|
||||||
|
base64Data,
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
} else if (mimeType.startsWith('application/pdf')) {
|
||||||
|
if (config.pdfAsImage && !isSupportVision) {
|
||||||
|
toast(
|
||||||
|
'Multimodal is not supported, PDF will be converted to text instead of image.'
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (config.pdfAsImage && isSupportVision) {
|
||||||
|
// Convert PDF to images
|
||||||
|
const base64Urls = await convertPDFToImage(file);
|
||||||
|
addItems(
|
||||||
|
base64Urls.map((base64Url) => ({
|
||||||
type: 'imageFile',
|
type: 'imageFile',
|
||||||
name: file.name,
|
name: file.name,
|
||||||
base64Url,
|
base64Url,
|
||||||
},
|
}))
|
||||||
]);
|
);
|
||||||
}
|
} else {
|
||||||
};
|
// Convert PDF to text
|
||||||
reader.readAsDataURL(file);
|
const content = await convertPDFToText(file);
|
||||||
} else if (
|
|
||||||
mimeType.startsWith('video/') ||
|
|
||||||
mimeType.startsWith('audio/')
|
|
||||||
) {
|
|
||||||
toast.error('Video and audio files are not supported yet.');
|
|
||||||
break;
|
|
||||||
} else if (mimeType.startsWith('application/pdf')) {
|
|
||||||
if (config.pdfAsImage && !isSupportVision) {
|
|
||||||
toast(
|
|
||||||
'Multimodal is not supported, PDF will be converted to text instead of image.'
|
|
||||||
);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const promise =
|
|
||||||
config.pdfAsImage && isSupportVision
|
|
||||||
? convertPDFToImage(file).then((base64Urls) => {
|
|
||||||
addItems(
|
|
||||||
base64Urls.map((base64Url) => ({
|
|
||||||
type: 'imageFile',
|
|
||||||
name: file.name,
|
|
||||||
base64Url,
|
|
||||||
}))
|
|
||||||
);
|
|
||||||
})
|
|
||||||
: convertPDFToText(file).then((content) => {
|
|
||||||
if (isSupportVision) {
|
|
||||||
toast.success(
|
|
||||||
'PDF file converted to text. You can also convert it to image, see in Settings.'
|
|
||||||
);
|
|
||||||
}
|
|
||||||
addItems([
|
|
||||||
{
|
|
||||||
type: 'textFile',
|
|
||||||
name: file.name,
|
|
||||||
content,
|
|
||||||
},
|
|
||||||
]);
|
|
||||||
});
|
|
||||||
|
|
||||||
promise.catch((error) => {
|
|
||||||
console.error(error);
|
|
||||||
toast.error('Failed to parse PDF file.');
|
|
||||||
});
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
// Because there can be many text file types (like code file), we will not check the mime type
|
|
||||||
// and will just check if the file is not binary.
|
|
||||||
const reader = new FileReader();
|
|
||||||
reader.onload = (event) => {
|
|
||||||
if (event.target?.result) {
|
|
||||||
const content = event.target.result as string;
|
|
||||||
if (!isLikelyNotBinary(content)) {
|
|
||||||
toast.error('File is binary. Please upload a text file.');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
addItems([
|
addItems([
|
||||||
{
|
{
|
||||||
type: 'textFile',
|
type: 'textFile',
|
||||||
|
@ -138,10 +116,40 @@ export function useChatExtraContext(): ChatExtraContextApi {
|
||||||
content,
|
content,
|
||||||
},
|
},
|
||||||
]);
|
]);
|
||||||
|
if (isSupportVision) {
|
||||||
|
toast.success(
|
||||||
|
'PDF file converted to text. You can also convert it to image, see in Settings.'
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
break;
|
||||||
reader.readAsText(file);
|
} else {
|
||||||
|
// Because there can be many text file types (like code file), we will not check the mime type
|
||||||
|
// and will just check if the file is not binary.
|
||||||
|
const reader = new FileReader();
|
||||||
|
reader.onload = (event) => {
|
||||||
|
if (event.target?.result) {
|
||||||
|
const content = event.target.result as string;
|
||||||
|
if (!isLikelyNotBinary(content)) {
|
||||||
|
toast.error('File is binary. Please upload a text file.');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
addItems([
|
||||||
|
{
|
||||||
|
type: 'textFile',
|
||||||
|
name: file.name,
|
||||||
|
content,
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
reader.readAsText(file);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} catch (error) {
|
||||||
|
const message = error instanceof Error ? error.message : String(error);
|
||||||
|
const errorMessage = `Error processing file: ${message}`;
|
||||||
|
toast.error(errorMessage);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -154,6 +162,25 @@ export function useChatExtraContext(): ChatExtraContextApi {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Read a file through FileReader and return its contents base64-encoded.
 *
 * @param file      The file to read.
 * @param outputUrl When true (the default), resolve with the full data URL
 *                  produced by `readAsDataURL`. When false, resolve with only
 *                  the text after the first comma, i.e. the payload without
 *                  the data-URL prefix.
 * @returns A promise that resolves with the encoded string. It rejects when
 *          the read fails, is aborted, or produces no result.
 */
async function getFileAsBase64(file: File, outputUrl = true): Promise<string> {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const result = reader.result;
      if (typeof result !== 'string' || !result) {
        reject(new Error('Failed to read file.'));
        return;
      }
      // Drop everything up to and including the first comma to keep only
      // the base64 payload when a plain string (not a data URL) is requested.
      resolve(outputUrl ? result : result.substring(result.indexOf(',') + 1));
    };

    // Without these handlers a failed or aborted read would never settle the
    // promise, so callers awaiting it would hang instead of seeing an error.
    reader.onerror = () => {
      reject(reader.error ?? new Error('Failed to read file.'));
    };
    reader.onabort = () => {
      reject(new Error('Failed to read file.'));
    };

    reader.readAsDataURL(file);
  });
}
|
||||||
|
|
||||||
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
|
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
const reader = new FileReader();
|
const reader = new FileReader();
|
||||||
|
|
|
@ -89,6 +89,14 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
|
||||||
type: 'image_url',
|
type: 'image_url',
|
||||||
image_url: { url: extra.base64Url },
|
image_url: { url: extra.base64Url },
|
||||||
});
|
});
|
||||||
|
} else if (extra.type === 'audioFile') {
|
||||||
|
contentArr.push({
|
||||||
|
type: 'input_audio',
|
||||||
|
input_audio: {
|
||||||
|
data: extra.base64Data,
|
||||||
|
format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
|
||||||
|
},
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
throw new Error('Unknown extra type');
|
throw new Error('Unknown extra type');
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,6 +51,7 @@ export interface Message {
|
||||||
export type MessageExtra =
|
export type MessageExtra =
|
||||||
| MessageExtraTextFile
|
| MessageExtraTextFile
|
||||||
| MessageExtraImageFile
|
| MessageExtraImageFile
|
||||||
|
| MessageExtraAudioFile
|
||||||
| MessageExtraContext;
|
| MessageExtraContext;
|
||||||
|
|
||||||
export interface MessageExtraTextFile {
|
export interface MessageExtraTextFile {
|
||||||
|
@ -65,6 +66,13 @@ export interface MessageExtraImageFile {
|
||||||
base64Url: string;
|
base64Url: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * An audio attachment carried alongside a chat message.
 */
export interface MessageExtraAudioFile {
  /** Discriminator identifying this extra as an audio attachment. */
  type: 'audioFile';
  /** Name of the uploaded file. */
  name: string;
  /** MIME type of the uploaded file. */
  mimeType: string;
  /** File contents as plain base64 (not a data URL). */
  base64Data: string;
}
|
||||||
|
|
||||||
export interface MessageExtraContext {
|
export interface MessageExtraContext {
|
||||||
type: 'context';
|
type: 'context';
|
||||||
name: string;
|
name: string;
|
||||||
|
@ -79,6 +87,10 @@ export type APIMessageContentPart =
|
||||||
| {
|
| {
|
||||||
type: 'image_url';
|
type: 'image_url';
|
||||||
image_url: { url: string };
|
image_url: { url: string };
|
||||||
|
}
|
||||||
|
| {
|
||||||
|
type: 'input_audio';
|
||||||
|
input_audio: { data: string; format: 'wav' | 'mp3' };
|
||||||
};
|
};
|
||||||
|
|
||||||
export type APIMessage = {
|
export type APIMessage = {
|
||||||
|
@ -120,6 +132,7 @@ export interface LlamaCppServerProps {
|
||||||
n_ctx: number;
|
n_ctx: number;
|
||||||
modalities?: {
|
modalities?: {
|
||||||
vision: boolean;
|
vision: boolean;
|
||||||
|
audio: boolean;
|
||||||
};
|
};
|
||||||
// TODO: support params
|
// TODO: support params
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue