server : support audio input (#13714)

* server : support audio input

* add audio support on webui
This commit is contained in:
Xuan-Son Nguyen 2025-05-23 11:03:47 +02:00 committed by GitHub
parent faaaff5f94
commit 9ecf3e66a3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 276 additions and 173 deletions

View file

@ -1,4 +1,8 @@
import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline';
import {
DocumentTextIcon,
SpeakerWaveIcon,
XMarkIcon,
} from '@heroicons/react/24/outline';
import { MessageExtra } from '../utils/types';
import { useState } from 'react';
import { classNames } from '../utils/misc';
@ -66,7 +70,11 @@ export default function ChatInputExtraContextItem({
className="w-14 h-14 flex items-center justify-center"
aria-description="Document icon"
>
<DocumentTextIcon className="h-8 w-14 text-base-content/50" />
{item.type === 'audioFile' ? (
<SpeakerWaveIcon className="h-8 w-8 text-gray-500" />
) : (
<DocumentTextIcon className="h-8 w-8 text-gray-500" />
)}
</div>
<div className="text-xs pr-4">
@ -98,6 +106,19 @@ export default function ChatInputExtraContextItem({
src={showingItem.base64Url}
alt={`Preview image for ${showingItem.name}`}
/>
) : showingItem.type === 'audioFile' ? (
<audio
controls
className="w-full"
aria-description={`Audio file ${showingItem.name}`}
>
<source
src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`}
type={showingItem.mimeType}
aria-description={`Audio file ${showingItem.name}`}
/>
Your browser does not support the audio element.
</audio>
) : (
<div className="overflow-x-auto">
<pre className="whitespace-pre-wrap break-words text-sm">

View file

@ -278,6 +278,13 @@ export default function ChatScreen() {
function ServerInfo() {
const { serverProps } = useAppContext();
const modalities = [];
if (serverProps?.modalities?.audio) {
modalities.push('audio');
}
if (serverProps?.modalities?.vision) {
modalities.push('vision');
}
return (
<div
className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
@ -291,6 +298,13 @@ function ServerInfo() {
<br />
<b>Build</b>: {serverProps?.build_info}
<br />
{modalities.length > 0 ? (
<>
<b>Supported modalities:</b> {modalities.join(', ')}
</>
) : (
''
)}
</p>
</div>
</div>

View file

@ -11,6 +11,7 @@ pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
// This file handles uploading extra context items (a.k.a files)
// It allows processing these kinds of files:
// - image files (converted to base64)
// - audio files (converted to base64)
// - text files (including code files)
// - pdf (converted to text)
@ -41,96 +42,73 @@ export function useChatExtraContext(): ChatExtraContextApi {
const isSupportVision = serverProps?.modalities?.vision;
const onFileAdded = (files: File[]) => {
for (const file of files) {
const mimeType = file.type;
console.debug({ mimeType, file });
if (file.size > 10 * 1024 * 1024) {
toast.error('File is too large. Maximum size is 10MB.');
break;
}
if (mimeType.startsWith('image/')) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
const onFileAdded = async (files: File[]) => {
try {
for (const file of files) {
const mimeType = file.type;
if (file.size > 10 * 1024 * 1024) {
toast.error('File is too large. Maximum size is 10MB.');
break;
}
const reader = new FileReader();
reader.onload = async (event) => {
if (event.target?.result) {
let base64Url = event.target.result as string;
if (mimeType === 'image/svg+xml') {
// Convert SVG to PNG
base64Url = await svgBase64UrlToPngDataURL(base64Url);
}
if (mimeType.startsWith('image/')) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
break;
}
addItems([
{
let base64Url = await getFileAsBase64(file);
if (mimeType === 'image/svg+xml') {
// Convert SVG to PNG
base64Url = await svgBase64UrlToPngDataURL(base64Url);
}
addItems([
{
type: 'imageFile',
name: file.name,
base64Url,
},
]);
} else if (mimeType.startsWith('video/')) {
toast.error('Video files are not supported yet.');
break;
} else if (mimeType.startsWith('audio/')) {
if (!/mpeg|wav/.test(mimeType)) {
toast.error('Only mp3 and wav audio files are supported.');
break;
}
// plain base64, not a data URL
const base64Data = await getFileAsBase64(file, false);
addItems([
{
type: 'audioFile',
name: file.name,
mimeType,
base64Data,
},
]);
} else if (mimeType.startsWith('application/pdf')) {
if (config.pdfAsImage && !isSupportVision) {
toast(
'Multimodal is not supported, PDF will be converted to text instead of image.'
);
break;
}
if (config.pdfAsImage && isSupportVision) {
// Convert PDF to images
const base64Urls = await convertPDFToImage(file);
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
},
]);
}
};
reader.readAsDataURL(file);
} else if (
mimeType.startsWith('video/') ||
mimeType.startsWith('audio/')
) {
toast.error('Video and audio files are not supported yet.');
break;
} else if (mimeType.startsWith('application/pdf')) {
if (config.pdfAsImage && !isSupportVision) {
toast(
'Multimodal is not supported, PDF will be converted to text instead of image.'
);
break;
}
const promise =
config.pdfAsImage && isSupportVision
? convertPDFToImage(file).then((base64Urls) => {
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
}))
);
})
: convertPDFToText(file).then((content) => {
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to image, see in Settings.'
);
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
});
promise.catch((error) => {
console.error(error);
toast.error('Failed to parse PDF file.');
});
break;
} else {
// Because there can be many text file types (like code file), we will not check the mime type
// and will just check if the file is not binary.
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
const content = event.target.result as string;
if (!isLikelyNotBinary(content)) {
toast.error('File is binary. Please upload a text file.');
return;
}
}))
);
} else {
// Convert PDF to text
const content = await convertPDFToText(file);
addItems([
{
type: 'textFile',
@ -138,10 +116,40 @@ export function useChatExtraContext(): ChatExtraContextApi {
content,
},
]);
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to image, see in Settings.'
);
}
}
};
reader.readAsText(file);
break;
} else {
// Because there can be many text file types (like code file), we will not check the mime type
// and will just check if the file is not binary.
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
const content = event.target.result as string;
if (!isLikelyNotBinary(content)) {
toast.error('File is binary. Please upload a text file.');
return;
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
}
};
reader.readAsText(file);
}
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
const errorMessage = `Error processing file: ${message}`;
toast.error(errorMessage);
}
};
@ -154,6 +162,25 @@ export function useChatExtraContext(): ChatExtraContextApi {
};
}
async function getFileAsBase64(file: File, outputUrl = true): Promise<string> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
let result = event.target.result as string;
if (!outputUrl) {
// remove base64 url prefix and correct characters
result = result.substring(result.indexOf(',') + 1);
}
resolve(result);
} else {
reject(new Error('Failed to read file.'));
}
};
reader.readAsDataURL(file);
});
}
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
return new Promise((resolve, reject) => {
const reader = new FileReader();

View file

@ -89,6 +89,14 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
type: 'image_url',
image_url: { url: extra.base64Url },
});
} else if (extra.type === 'audioFile') {
contentArr.push({
type: 'input_audio',
input_audio: {
data: extra.base64Data,
format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
},
});
} else {
throw new Error('Unknown extra type');
}

View file

@ -51,6 +51,7 @@ export interface Message {
export type MessageExtra =
| MessageExtraTextFile
| MessageExtraImageFile
| MessageExtraAudioFile
| MessageExtraContext;
export interface MessageExtraTextFile {
@ -65,6 +66,13 @@ export interface MessageExtraImageFile {
base64Url: string;
}
export interface MessageExtraAudioFile {
type: 'audioFile';
name: string;
base64Data: string;
mimeType: string;
}
export interface MessageExtraContext {
type: 'context';
name: string;
@ -79,6 +87,10 @@ export type APIMessageContentPart =
| {
type: 'image_url';
image_url: { url: string };
}
| {
type: 'input_audio';
input_audio: { data: string; format: 'wav' | 'mp3' };
};
export type APIMessage = {
@ -120,6 +132,7 @@ export interface LlamaCppServerProps {
n_ctx: number;
modalities?: {
vision: boolean;
audio: boolean;
};
// TODO: support params
}