ai: add RAG, speech-to-text and text-to-speech
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,8 @@
|
|||||||
|
/** Lower-cased Ollama model tags that this module accepts for speech-to-text. */
const OLLAMA_SPEECH_TO_TEXT_MODELS = new Set<string>(["gemma4:e2b", "gemma4:e4b"]);

/**
 * Tells whether `model` names one of the accepted Ollama speech-to-text models.
 *
 * The comparison ignores surrounding whitespace and letter case.
 *
 * @param model Model tag to test; `null`, `undefined` and `""` yield `false`.
 * @returns `true` when the normalized tag is in the accepted set.
 */
export function isOllamaSpeechToTextModel(model: string | undefined | null): boolean {
    if (!model) return false;

    const tag = model.trim().toLowerCase();
    return OLLAMA_SPEECH_TO_TEXT_MODELS.has(tag);
}
|
||||||
@@ -0,0 +1,256 @@
|
|||||||
|
import fs, {openAsBlob} from "node:fs";
|
||||||
|
import {AiProvider} from "../model/ai-provider";
|
||||||
|
import {
|
||||||
|
getAvailableAiProviderChoices,
|
||||||
|
getProviderChoiceLabel,
|
||||||
|
normalizeAiProviderChoice,
|
||||||
|
resolveEffectiveAiProviderForUser,
|
||||||
|
} from "../common/user-ai-settings";
|
||||||
|
import {AiDownloadedFile} from "./telegram-attachments";
|
||||||
|
import {isOllamaSpeechToTextModel} from "./speech-to-text-models";
|
||||||
|
import {
|
||||||
|
createGoogleGenAiClient,
|
||||||
|
createMistralClient,
|
||||||
|
createOllamaClient,
|
||||||
|
createOpenAiClient,
|
||||||
|
resolveAiRuntimeTarget
|
||||||
|
} from "./ai-runtime-target";
|
||||||
|
import {Environment} from "../common/environment";
|
||||||
|
|
||||||
|
/** Outcome of one transcription call. */
export type TranscribedSpeech = {
    /** Provider that handled the request. */
    provider: AiProvider;
    /** Model name taken from the resolved runtime target. */
    model: string;
    /** Transcribed text; an empty string when the provider returned none. */
    text: string;
    /** File name of the audio that was transcribed. */
    fileName: string;
};

/** Input for {@link transcribeSpeech}. */
export type SpeechToTextRequest = {
    /** Provider whose transcription backend is used. */
    provider: AiProvider;
    /** Downloaded audio file to transcribe. */
    audio: AiDownloadedFile;
    /** Optional cancellation signal. */
    signal?: AbortSignal;
};

/** Result of choosing a speech-to-text provider for a user. */
export type SpeechToTextProviderResolution = {
    /** Provider that was selected. */
    provider: AiProvider;
    /** `true` when the selected provider is a substitute rather than the first choice. */
    fallback: boolean;
};

/** Options for {@link resolveSpeechToTextProviderForUser}. */
export type SpeechToTextResolveOptions = {
    /** Whether another provider may be substituted for an unconfigured preferred one (default `true`). */
    allowFallback?: boolean;
};
|
||||||
|
|
||||||
|
/** Returns the display label of `provider`, as produced by `getProviderChoiceLabel`. */
function providerName(provider: AiProvider): string {
    const label = getProviderChoiceLabel(provider);
    return label;
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a downloaded attachment can be sent to a transcription backend.
 *
 * Every `"audio"` download qualifies. A `"video-note"` download qualifies only when
 * its MIME type starts with `audio/` or its path ends in `.wav` (case-insensitive).
 *
 * @param download Downloaded attachment to inspect.
 * @returns `true` when the download is transcribable.
 */
export function isTranscribableAudioDownload(download: AiDownloadedFile): boolean {
    switch (download.kind) {
        case "audio":
            return true;
        case "video-note": {
            const hasAudioMime = download.mimeType?.startsWith("audio/") ?? false;
            return hasAudioMime || download.path.toLowerCase().endsWith(".wav");
        }
        default:
            return false;
    }
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `provider` has a usable speech-to-text runtime target.
 *
 * OpenAI, Gemini and Mistral need both an API key and a model. Ollama needs a base
 * URL and a model accepted by `isOllamaSpeechToTextModel`.
 *
 * @param provider Provider to check.
 * @returns `true` when the resolved "speechToText" target is complete.
 */
export function isSpeechToTextConfigured(provider: AiProvider): boolean {
    const hasKeyAndModel = (target: { apiKey?: unknown; model?: unknown }): boolean =>
        !!target.apiKey && !!target.model;

    switch (provider) {
        case AiProvider.OPENAI:
            return hasKeyAndModel(resolveAiRuntimeTarget(provider, "speechToText"));
        case AiProvider.GEMINI:
            return hasKeyAndModel(resolveAiRuntimeTarget(provider, "speechToText"));
        case AiProvider.MISTRAL:
            return hasKeyAndModel(resolveAiRuntimeTarget(provider, "speechToText"));
        case AiProvider.OLLAMA: {
            const localTarget = resolveAiRuntimeTarget(provider, "speechToText");
            return !!localTarget.baseUrl && isOllamaSpeechToTextModel(localTarget.model);
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Picks the speech-to-text provider to use for a user.
 *
 * Resolution order:
 * 1. `preferredProvider`, when it is allowed for the user and configured.
 * 2. The user's effective provider, when configured.
 * 3. The first allowed provider that is configured.
 *
 * @param userId User whose available providers are consulted.
 * @param preferredProvider Provider explicitly requested, if any.
 * @param options `allowFallback` (default `true`) controls whether steps 2-3 may run
 *        when the preferred provider is not configured.
 * @returns The chosen provider and whether it is a substitute for the preferred one.
 * @throws Error when the preferred provider is not allowed for the user, when it is
 *         not configured and fallback is disabled, or when no provider is configured.
 */
export async function resolveSpeechToTextProviderForUser(
    userId: number,
    preferredProvider?: AiProvider,
    options: SpeechToTextResolveOptions = {},
): Promise<SpeechToTextProviderResolution> {
    const mayFallBack = options.allowFallback ?? true;

    const choices = getAvailableAiProviderChoices(userId);
    const normalizedChoices = choices.map(choice => normalizeAiProviderChoice(choice));
    const allowedProviders = normalizedChoices
        .filter((choice): choice is AiProvider => !!choice && choice !== "DEFAULT");

    if (preferredProvider) {
        const isAllowed = allowedProviders.includes(preferredProvider);
        if (!isAllowed) {
            throw new Error(Environment.getProviderNotAvailableForAccessText(providerName(preferredProvider)));
        }

        if (isSpeechToTextConfigured(preferredProvider)) {
            return {provider: preferredProvider, fallback: false};
        }

        if (!mayFallBack) {
            throw new Error(Environment.getProviderSpeechToTextUnsupportedText(providerName(preferredProvider)));
        }
    }

    const effective = await resolveEffectiveAiProviderForUser(userId);
    if (isSpeechToTextConfigured(effective)) {
        // Only a fallback when the caller asked for a different provider.
        const replacedPreferred = preferredProvider !== undefined && preferredProvider !== effective;
        return {provider: effective, fallback: replacedPreferred};
    }

    const substitute = allowedProviders.find(isSpeechToTextConfigured);
    if (substitute) {
        return {provider: substitute, fallback: true};
    }

    throw new Error(Environment.noSpeechToTextProviderForAccessText);
}
|
||||||
|
|
||||||
|
/**
 * Transcribes one audio file with the backend of the requested provider.
 *
 * @param request Provider, audio file and optional abort signal.
 * @returns The transcription produced by the provider-specific implementation.
 * @throws Error with message "Aborted" when the signal is already aborted on entry;
 *         provider errors propagate unchanged.
 */
export async function transcribeSpeech(request: SpeechToTextRequest): Promise<TranscribedSpeech> {
    const {provider, audio, signal} = request;

    if (signal?.aborted) {
        throw new Error("Aborted");
    }

    switch (provider) {
        case AiProvider.OPENAI:
            return transcribeOpenAiSpeech(audio, signal);
        case AiProvider.GEMINI:
            return transcribeGeminiSpeech(audio, signal);
        case AiProvider.MISTRAL:
            return transcribeMistralSpeech(audio, signal);
        case AiProvider.OLLAMA:
            return transcribeOllamaSpeech(audio, signal);
    }
}
|
||||||
|
|
||||||
|
/**
 * Transcribes every transcribable download, one after another, and joins the results.
 *
 * When more than one audio file is present, each non-empty transcript is prefixed
 * with `[<position>. <fileName>]`; a single file yields just its text. Empty
 * transcripts are skipped, and the pieces are separated by a blank line.
 *
 * @param provider Provider used for every file.
 * @param downloads Downloaded attachments; non-audio entries are ignored.
 * @param signal Optional abort signal, checked before each file and forwarded.
 * @returns The combined, trimmed transcript (possibly empty).
 * @throws Error with message "Aborted" when the signal is aborted between files.
 */
export async function transcribeSpeechDownloads(provider: AiProvider, downloads: AiDownloadedFile[], signal?: AbortSignal): Promise<string> {
    const audioFiles = downloads.filter(isTranscribableAudioDownload);
    const needsLabels = audioFiles.length > 1;
    const pieces: string[] = [];

    let position = 0;
    for (const audio of audioFiles) {
        position += 1;

        if (signal?.aborted) {
            throw new Error("Aborted");
        }

        const transcribed = await transcribeSpeech({provider, audio, signal});
        const text = transcribed.text.trim();
        if (text) {
            pieces.push(needsLabels ? `[${position}. ${audio.fileName}]\n${text}` : text);
        }
    }

    return pieces.join("\n\n").trim();
}
|
||||||
|
|
||||||
|
/**
 * Transcribes `audio` through the OpenAI audio transcription endpoint.
 *
 * The file is streamed from `audio.path`; the stream is destroyed whether or not
 * the request succeeds.
 *
 * @param audio Downloaded audio file.
 * @param signal Optional abort signal forwarded to the OpenAI request options.
 * @returns Transcription metadata with `text` defaulting to an empty string.
 */
async function transcribeOpenAiSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.OPENAI, "speechToText");
    const client = createOpenAiClient(target);
    const stream = fs.createReadStream(audio.path);

    try {
        const transcription = await client.audio.transcriptions.create(
            {file: stream, model: target.model},
            {signal},
        );
        const text = transcription.text || "";

        return {provider: AiProvider.OPENAI, model: target.model, text, fileName: audio.fileName};
    } finally {
        stream.destroy();
    }
}
|
||||||
|
|
||||||
|
/**
 * Transcribes `audio` through the Mistral audio transcription API.
 *
 * @param audio Downloaded audio file; its content is opened as a Blob from `audio.path`.
 * @param signal Optional abort signal forwarded to the request options.
 * @returns Transcription metadata with `text` defaulting to an empty string.
 */
async function transcribeMistralSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.MISTRAL, "speechToText");
    const client = createMistralClient(target);

    const blob = await openAsBlob(audio.path);
    const transcription = await client.audio.transcriptions.complete(
        {model: target.model, file: blob},
        {signal},
    );

    const text = transcription.text || "";
    return {provider: AiProvider.MISTRAL, model: target.model, text, fileName: audio.fileName};
}
|
||||||
|
|
||||||
|
/**
 * Transcribes `audio` by sending it inline (base64) to a Gemini model together
 * with a verbatim-transcription instruction.
 *
 * @param audio Downloaded audio file; `audio.buffer` supplies the bytes and the MIME
 *        type falls back to `audio/wav` when missing.
 * @param signal Optional abort signal passed as `config.abortSignal`.
 * @returns Transcription metadata; the text is extracted by `collectGeminiText`.
 */
async function transcribeGeminiSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.GEMINI, "speechToText");
    const client = createGoogleGenAiClient(target);

    const instructionPart = {
        text: "Transcribe the attached audio verbatim. Reply only with the transcription text. Do not answer the speaker.",
    };
    const audioPart = {
        inlineData: {
            data: audio.buffer.toString("base64"),
            mimeType: audio.mimeType || "audio/wav",
        },
    };

    const response = await client.models.generateContent({
        model: target.model,
        contents: [{role: "user", parts: [instructionPart, audioPart]}],
        config: {temperature: 0, abortSignal: signal},
    });

    const text = collectGeminiText(response);
    return {provider: AiProvider.GEMINI, model: target.model, text, fileName: audio.fileName};
}
|
||||||
|
|
||||||
|
/**
 * Transcribes `audio` with an Ollama chat model that accepts audio input.
 *
 * The audio bytes are sent base64-encoded in the message's `images` field together
 * with a verbatim-transcription instruction.
 *
 * Cancellation: `ollama.chat` is not given the signal here, so the call is raced
 * against it instead. When the signal fires while the request is in flight this
 * function rejects with "Aborted" rather than returning a transcript nobody is
 * waiting for.
 * NOTE(review): the underlying HTTP request is not cancelled by this — confirm
 * whether the client offers a per-request abort that could be wired in.
 *
 * @param audio Downloaded audio file; `audio.buffer` supplies the bytes.
 * @param signal Optional abort signal.
 * @returns Transcription metadata with `text` defaulting to an empty string.
 * @throws Error "Aborted" when the signal is aborted before or during the request;
 *         Error with `Environment.ollamaSpeechToTextModelRequiredText` when the
 *         configured model is not an accepted speech-to-text model.
 */
async function transcribeOllamaSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    if (signal?.aborted) throw new Error("Aborted");

    const target = resolveAiRuntimeTarget(AiProvider.OLLAMA, "speechToText");
    const model = target.model;
    if (!isOllamaSpeechToTextModel(model)) {
        throw new Error(Environment.ollamaSpeechToTextModelRequiredText);
    }

    const ollama = createOllamaClient(target);
    const pendingChat = ollama.chat({
        model,
        stream: false,
        think: false,
        messages: [{
            role: "user",
            content: "Transcribe the attached audio verbatim. Reply only with the transcription text. Do not answer the speaker.",
            images: [audio.buffer.toString("base64")],
        }],
        options: {
            temperature: 0,
        },
    });

    const response = await (signal ? rejectWhenAborted(pendingChat, signal) : pendingChat);

    return {
        provider: AiProvider.OLLAMA,
        model,
        text: response?.message?.content || "",
        fileName: audio.fileName,
    };
}

/**
 * Settles like `work`, unless `signal` aborts first, in which case it rejects with
 * Error("Aborted"). The abort listener is removed once `work` settles.
 */
function rejectWhenAborted<T>(work: Promise<T>, signal: AbortSignal): Promise<T> {
    return new Promise<T>((resolve, reject) => {
        const onAbort = (): void => reject(new Error("Aborted"));

        // An abort that happened before the listener was attached would never fire it.
        if (signal.aborted) {
            onAbort();
            return;
        }

        signal.addEventListener("abort", onAbort, {once: true});
        work
            .then(resolve, reject)
            .finally(() => signal.removeEventListener("abort", onAbort));
    });
}
|
||||||
|
|
||||||
|
/**
 * Extracts the text of a Gemini response.
 *
 * Lookup order:
 * 1. `response.text`, when it is a string (even an empty one).
 * 2. The concatenated `text` of every part of every candidate, when not blank.
 * 3. Per candidate: the candidate itself if it is a string, otherwise the text of
 *    its first part; all concatenated.
 *
 * @param response Response object of unknown shape (may be `undefined`).
 * @returns The extracted text, or an empty string when nothing is found.
 */
function collectGeminiText(response: any): string {
    if (typeof response?.text === "string") return response.text;

    const candidates = response?.candidates ?? [];

    const allParts = candidates.flatMap((candidate: any) => candidate?.content?.parts ?? []);
    const joinedParts: string = allParts.map((part: any) => part?.text ?? "").join("");
    if (joinedParts.trim()) {
        return joinedParts;
    }

    const firstTexts: any[] = [];
    for (const entry of candidates) {
        if (typeof entry === "string") {
            firstTexts.push(entry);
        } else {
            firstTexts.push(entry?.content?.parts?.[0]?.text ?? "");
        }
    }
    return firstTexts.join("");
}
|
||||||
@@ -0,0 +1,435 @@
|
|||||||
|
import fs from "node:fs";
|
||||||
|
import path from "node:path";
|
||||||
|
import {randomUUID} from "node:crypto";
|
||||||
|
import {FileOptions, Message} from "typescript-telegram-bot-api";
|
||||||
|
import {AiProvider} from "../model/ai-provider";
|
||||||
|
import {Environment} from "../common/environment";
|
||||||
|
import {bot} from "../index";
|
||||||
|
import {
|
||||||
|
getAvailableAiProviderChoices,
|
||||||
|
getProviderChoiceLabel,
|
||||||
|
normalizeAiProviderChoice,
|
||||||
|
resolveEffectiveAiProviderForUser,
|
||||||
|
} from "../common/user-ai-settings";
|
||||||
|
import {enqueueTelegramApiCall} from "../util/telegram-api-queue";
|
||||||
|
import {MessageStore} from "../common/message-store";
|
||||||
|
import {StoredAttachment} from "../model/stored-attachment";
|
||||||
|
import {StoredMessage} from "../model/stored-message";
|
||||||
|
import {logError} from "../util/utils";
|
||||||
|
import {SpeechRequest} from "@mistralai/mistralai/models/components";
|
||||||
|
import {createGoogleGenAiClient, createMistralClient, createOpenAiClient, resolveAiRuntimeTarget} from "./ai-runtime-target";
|
||||||
|
|
||||||
|
/** Longest text (in UTF-16 code units, after trimming) accepted for synthesis. */
const MAX_TTS_TEXT_CHARS = 4096;

/** Largest synthesized file, in bytes, that this module will try to upload (50 MiB). */
const TELEGRAM_FILE_LIMIT_BYTES = 50 * 1024 * 1024;

/** Audio container/codec identifiers used as file extension for synthesized speech. */
export type TextToSpeechFormat = "mp3" | "wav" | "flac" | "opus" | "aac" | "pcm";

/** A synthesized audio file that has been written to the cache directory. */
export type SynthesizedSpeech = {
    /** Provider that produced the audio. */
    provider: AiProvider;
    /** Model name reported for the synthesis. */
    model: string;
    /** Voice used, when one was selected. */
    voice?: string;
    /** Audio format; also the file extension. */
    format: TextToSpeechFormat;
    /** MIME type sent along with the upload. */
    mimeType: string;
    /** Generated file name (without directory). */
    fileName: string;
    /** Full path of the cached file. */
    path: string;
    /** Size of the audio data in bytes. */
    sizeBytes: number;
};

/** Input for {@link synthesizeSpeech}. */
export type TextToSpeechRequest = {
    /** Provider whose synthesis backend is used. */
    provider: AiProvider;
    /** Text to speak. */
    text: string;
    /** Optional voice override. */
    voice?: string;
};

/** Result of choosing a text-to-speech provider for a user. */
export type TextToSpeechProviderResolution = {
    /** Provider that was selected. */
    provider: AiProvider;
    /** `true` when the provider is a substitute for the user's effective provider. */
    fallback: boolean;
};

/** Everything `writeSpeechFile` needs: speech metadata plus the raw audio bytes. */
type SpeechFileParams = Omit<SynthesizedSpeech, "fileName" | "path" | "sizeBytes"> & {
    /** Encoded audio to write to disk. */
    buffer: Buffer;
};
|
||||||
|
|
||||||
|
/** Directory for synthesized audio: `<Environment.DATA_PATH>/cache/audio`. */
function ttsCacheDir(): string {
    const segments = ["cache", "audio"];
    return path.join(Environment.DATA_PATH, ...segments);
}
|
||||||
|
|
||||||
|
/** Returns the display label of `provider`, as produced by `getProviderChoiceLabel`. */
function providerName(provider: AiProvider): string {
    const label = getProviderChoiceLabel(provider);
    return label;
}
|
||||||
|
|
||||||
|
/**
 * Validates text before synthesis.
 *
 * @param text Raw input text.
 * @returns The trimmed text.
 * @throws Error when the trimmed text is empty or longer than `MAX_TTS_TEXT_CHARS`.
 */
function assertText(text: string): string {
    const trimmed = text.trim();
    const length = trimmed.length;

    if (length === 0) {
        throw new Error(Environment.noTextToSynthesizeText);
    }
    if (length > MAX_TTS_TEXT_CHARS) {
        throw new Error(Environment.getTextToSpeechTooLongText(length, MAX_TTS_TEXT_CHARS));
    }

    return trimmed;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `provider` has a usable text-to-speech runtime target.
 *
 * OpenAI, Gemini and Mistral need both an API key and a model; Ollama is always
 * reported as not configured.
 *
 * @param provider Provider to check.
 * @returns `true` when the resolved "textToSpeech" target is complete.
 */
export function isTextToSpeechConfigured(provider: AiProvider): boolean {
    const hasKeyAndModel = (target: { apiKey?: unknown; model?: unknown }): boolean =>
        !!target.apiKey && !!target.model;

    switch (provider) {
        case AiProvider.OPENAI:
            return hasKeyAndModel(resolveAiRuntimeTarget(provider, "textToSpeech"));
        case AiProvider.GEMINI:
            return hasKeyAndModel(resolveAiRuntimeTarget(provider, "textToSpeech"));
        case AiProvider.MISTRAL:
            return hasKeyAndModel(resolveAiRuntimeTarget(provider, "textToSpeech"));
        case AiProvider.OLLAMA:
            return false;
    }
}
|
||||||
|
|
||||||
|
/**
 * Picks the text-to-speech provider to use for a user.
 *
 * An explicit provider is either used as-is or rejected; it is never substituted.
 * Without one, the user's effective provider is used when configured, otherwise
 * the first allowed provider that is configured.
 *
 * @param userId User whose available providers are consulted.
 * @param explicitProvider Provider explicitly requested, if any.
 * @returns The chosen provider; `fallback` is `true` only for the last-resort pick.
 * @throws Error when the explicit provider is not allowed or not configured, or when
 *         no allowed provider is configured.
 */
export async function resolveTextToSpeechProviderForUser(
    userId: number,
    explicitProvider?: AiProvider,
): Promise<TextToSpeechProviderResolution> {
    const choices = getAvailableAiProviderChoices(userId);
    const normalizedChoices = choices.map(choice => normalizeAiProviderChoice(choice));
    const allowedProviders = normalizedChoices
        .filter((choice): choice is AiProvider => !!choice && choice !== "DEFAULT");

    if (explicitProvider) {
        const isAllowed = allowedProviders.includes(explicitProvider);
        if (!isAllowed) {
            throw new Error(Environment.getProviderNotAvailableForAccessText(providerName(explicitProvider)));
        }

        const isConfigured = isTextToSpeechConfigured(explicitProvider);
        if (!isConfigured) {
            throw new Error(Environment.getProviderTextToSpeechUnsupportedText(providerName(explicitProvider)));
        }

        return {provider: explicitProvider, fallback: false};
    }

    const effective = await resolveEffectiveAiProviderForUser(userId);
    if (isTextToSpeechConfigured(effective)) {
        return {provider: effective, fallback: false};
    }

    const substitute = allowedProviders.find(isTextToSpeechConfigured);
    if (substitute) {
        return {provider: substitute, fallback: true};
    }

    throw new Error(Environment.noTextToSpeechProviderForAccessText);
}
|
||||||
|
|
||||||
|
/**
 * Synthesizes speech for the request's text with the requested provider.
 *
 * The text is validated by `assertText` before any provider is contacted.
 *
 * @param request Provider, text and optional voice.
 * @returns Metadata of the audio file written to the cache.
 * @throws Error when the text is empty or too long, or when the provider is Ollama.
 */
export async function synthesizeSpeech(request: TextToSpeechRequest): Promise<SynthesizedSpeech> {
    const text = assertText(request.text);
    const {provider, voice} = request;

    switch (provider) {
        case AiProvider.OPENAI:
            return synthesizeOpenAiSpeech(text, voice);
        case AiProvider.GEMINI:
            return synthesizeGeminiSpeech(text, voice);
        case AiProvider.MISTRAL:
            return synthesizeMistralSpeech(text, voice);
        case AiProvider.OLLAMA:
            throw new Error(Environment.ollamaTextToSpeechUnsupportedText);
    }
}
|
||||||
|
|
||||||
|
/**
 * Synthesizes `text` as MP3 through the OpenAI speech endpoint and caches the file.
 *
 * @param text Validated text to speak.
 * @param voice Voice override; falls back to `Environment.OPENAI_TTS_VOICE`.
 * @returns Metadata of the cached MP3 file.
 */
async function synthesizeOpenAiSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.OPENAI, "textToSpeech");
    const client = createOpenAiClient(target);
    const selectedVoice = voice || Environment.OPENAI_TTS_VOICE;

    const response = await client.audio.speech.create({
        model: target.model,
        voice: selectedVoice,
        input: text,
        response_format: "mp3",
        instructions: Environment.OPENAI_TTS_INSTRUCTIONS,
    });
    const audioBytes = await response.arrayBuffer();

    return writeSpeechFile({
        provider: AiProvider.OPENAI,
        model: target.model,
        voice: selectedVoice,
        buffer: Buffer.from(audioBytes),
        format: "mp3",
        mimeType: "audio/mpeg",
    });
}
|
||||||
|
|
||||||
|
/**
 * Synthesizes `text` as MP3 through the Mistral speech API and caches the file.
 *
 * `model` and `voiceId` are only added to the request when a value is available.
 * The response is expected to carry base64 audio in `audioData` (or `audio_data`).
 *
 * @param text Validated text to speak.
 * @param voice Voice id override; falls back to `Environment.MISTRAL_TTS_VOICE_ID`.
 * @returns Metadata of the cached MP3 file.
 * @throws Error when the response holds no non-blank audio string.
 */
async function synthesizeMistralSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.MISTRAL, "textToSpeech");
    const client = createMistralClient(target);
    const voiceId = voice || Environment.MISTRAL_TTS_VOICE_ID;

    // `stream: false` is intentionally not sent (it was commented out by the author).
    const request: SpeechRequest = {
        input: text,
        responseFormat: "mp3",
        ...(target.model ? {model: target.model} : {}),
        ...(voiceId ? {voiceId} : {}),
    };

    const response: any = await client.audio.speech.complete(request);
    const encodedAudio = response?.audioData ?? response?.audio_data;
    const hasAudio = typeof encodedAudio === "string" && encodedAudio.trim() !== "";
    if (!hasAudio) {
        throw new Error(Environment.mistralTtsNoAudioDataText);
    }

    return writeSpeechFile({
        provider: AiProvider.MISTRAL,
        model: target.model || "mistral speech",
        voice: voiceId,
        buffer: Buffer.from(encodedAudio, "base64"),
        format: "mp3",
        mimeType: "audio/mpeg",
    });
}
|
||||||
|
|
||||||
|
/**
 * Synthesizes `text` with a Gemini model asked for AUDIO output and caches the file.
 *
 * The first audio part found anywhere in the response is decoded by
 * `decodeGeminiAudio`, which also decides the resulting format and MIME type.
 *
 * @param text Validated text to speak.
 * @param voice Prebuilt voice name override; falls back to `Environment.GEMINI_TTS_VOICE`.
 * @returns Metadata of the cached audio file.
 * @throws Error when the response contains no audio part.
 */
async function synthesizeGeminiSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.GEMINI, "textToSpeech");
    const client = createGoogleGenAiClient(target);
    const voiceName = voice || Environment.GEMINI_TTS_VOICE;

    const response: any = await client.models.generateContent({
        model: target.model,
        contents: text,
        config: {
            responseModalities: ["AUDIO"],
            speechConfig: {
                voiceConfig: {
                    prebuiltVoiceConfig: {voiceName},
                },
            },
        },
    });

    const audioPart = findGeminiAudioPart(response);
    if (audioPart === null) {
        throw new Error(Environment.geminiTextToSpeechUnsupportedText);
    }

    const {buffer, format, mimeType} = decodeGeminiAudio(audioPart.data, audioPart.mimeType);

    return writeSpeechFile({
        provider: AiProvider.GEMINI,
        model: target.model,
        voice: voiceName,
        buffer,
        format,
        mimeType,
    });
}
|
||||||
|
|
||||||
|
/**
 * Depth-first search for the first inline audio payload inside an arbitrary value.
 *
 * A node matches when it has an `inlineData` (or `inline_data`) object whose `data`
 * is a string and whose `mimeType` (or `mime_type`) is either missing or starts
 * with `audio/`. A node is checked before its children; children are visited in
 * `Object.values` order, descending into arrays and nested objects.
 *
 * @param value Any value, typically a Gemini response.
 * @returns The base64 data and its MIME type (if a string), or `null` when none is found.
 */
function findGeminiAudioPart(value: unknown): { data: string; mimeType?: string } | null {
    if (typeof value !== "object" || !value) return null;

    const node = value as Record<string, unknown>;

    const inline = node.inlineData ?? node.inline_data;
    if (inline && typeof inline === "object") {
        const fields = inline as Record<string, unknown>;
        const payload = fields.data;
        const mime = fields.mimeType ?? fields.mime_type;
        const mimeAcceptable = !mime || String(mime).startsWith("audio/");

        if (typeof payload === "string" && mimeAcceptable) {
            return {data: payload, mimeType: typeof mime === "string" ? mime : undefined};
        }
    }

    for (const child of Object.values(node)) {
        // Arrays are searched element by element; anything else is searched as one node.
        const targets: unknown[] = Array.isArray(child) ? child : [child];
        for (const target of targets) {
            const hit = findGeminiAudioPart(target);
            if (hit) return hit;
        }
    }

    return null;
}
|
||||||
|
|
||||||
|
/**
 * Decodes base64 audio returned by Gemini and decides how to store it.
 *
 * Recognised containers (MP3, WAV, FLAC, Opus, AAC — matched by substring of the
 * MIME type, WAV also by a leading "RIFF" tag) are passed through unchanged. Any
 * other payload is treated as 16-bit mono PCM and wrapped in a WAV header, using
 * the `rate=<n>` parameter of the MIME type or 24000 Hz when absent.
 *
 * @param data Base64-encoded audio.
 * @param mimeType MIME type reported for the audio (default `audio/wav`).
 * @returns The audio bytes plus the format and MIME type to store them under.
 */
function decodeGeminiAudio(data: string, mimeType = "audio/wav"): {
    buffer: Buffer;
    format: TextToSpeechFormat;
    mimeType: string;
} {
    const mime = mimeType.toLowerCase();
    const raw = Buffer.from(data, "base64");
    const mentions = (needle: string): boolean => mime.includes(needle);

    if (mentions("mpeg") || mentions("mp3")) {
        return {buffer: raw, format: "mp3", mimeType: "audio/mpeg"};
    }

    const startsWithRiff = (): boolean => raw.subarray(0, 4).toString("ascii") === "RIFF";
    if (mentions("wav") || startsWithRiff()) {
        return {buffer: raw, format: "wav", mimeType: "audio/wav"};
    }

    const passThroughFormats: TextToSpeechFormat[] = ["flac", "opus", "aac"];
    for (const format of passThroughFormats) {
        if (mentions(format)) {
            return {buffer: raw, format, mimeType: `audio/${format}`};
        }
    }

    // Nothing recognised: assume headerless PCM and make it playable.
    const declaredRate = Number(/rate=(\d+)/i.exec(mimeType)?.[1]);
    const sampleRate = declaredRate || 24_000;

    return {
        buffer: wrapPcm16InWav(raw, sampleRate, 1),
        format: "wav",
        mimeType: "audio/wav",
    };
}
|
||||||
|
|
||||||
|
/**
 * Prepends a 44-byte canonical WAV (RIFF, PCM format tag 1) header to 16-bit PCM data.
 *
 * @param pcm Raw 16-bit PCM samples.
 * @param sampleRate Samples per second per channel.
 * @param channels Number of interleaved channels.
 * @returns A new buffer holding the header followed by `pcm`.
 */
function wrapPcm16InWav(pcm: Buffer, sampleRate: number, channels: number): Buffer {
    const BITS_PER_SAMPLE = 16;
    const HEADER_BYTES = 44;
    const frameBytes = channels * BITS_PER_SAMPLE / 8;

    const header = Buffer.alloc(HEADER_BYTES);
    let cursor = 0;
    const tag = (fourCc: string): void => {
        header.write(fourCc, cursor);
        cursor += 4;
    };
    const u32 = (value: number): void => {
        header.writeUInt32LE(value, cursor);
        cursor += 4;
    };
    const u16 = (value: number): void => {
        header.writeUInt16LE(value, cursor);
        cursor += 2;
    };

    tag("RIFF");
    u32(HEADER_BYTES - 8 + pcm.length);             // RIFF chunk size
    tag("WAVE");
    tag("fmt ");
    u32(16);                                        // fmt chunk size
    u16(1);                                         // format tag: PCM
    u16(channels);
    u32(sampleRate);
    u32(sampleRate * channels * BITS_PER_SAMPLE / 8); // byte rate
    u16(frameBytes);                                // block align
    u16(BITS_PER_SAMPLE);
    tag("data");
    u32(pcm.length);

    return Buffer.concat([header, pcm]);
}
|
||||||
|
|
||||||
|
/**
 * Writes synthesized audio into the cache directory (created on demand) and
 * returns its metadata.
 *
 * The file is named `<provider>-tts-<epoch ms>-<uuid>.<format>` with the provider
 * lower-cased.
 *
 * @param params Speech metadata and the audio bytes.
 * @returns The metadata extended with file name, path and size in bytes.
 */
function writeSpeechFile(params: SpeechFileParams): SynthesizedSpeech {
    const {provider, model, voice, format, mimeType, buffer} = params;

    const directory = ttsCacheDir();
    fs.mkdirSync(directory, {recursive: true});

    const fileName = `${provider.toLowerCase()}-tts-${Date.now()}-${randomUUID()}.${format}`;
    const filePath = path.join(directory, fileName);
    fs.writeFileSync(filePath, buffer);

    return {
        provider,
        model,
        voice,
        format,
        mimeType,
        fileName,
        path: filePath,
        sizeBytes: buffer.length,
    };
}
|
||||||
|
|
||||||
|
/** Opens the cached speech file as a read stream wrapped in upload options (file name and content type). */
function createSpeechUpload(speech: SynthesizedSpeech): FileOptions {
    const stream = fs.createReadStream(speech.path);
    const metadata = {filename: speech.fileName, contentType: speech.mimeType};

    return new FileOptions(stream, metadata);
}
|
||||||
|
|
||||||
|
/** Destroys the upload's underlying file object when it exposes a `destroy()` method; otherwise does nothing. */
function destroyUpload(upload: FileOptions): void {
    const source = upload.file;

    if (!("destroy" in source)) return;
    if (typeof source.destroy !== "function") return;

    source.destroy();
}
|
||||||
|
|
||||||
|
/**
 * Sends synthesized speech as a reply to `sourceMessage` and records the sent message.
 *
 * MP3 and Opus files are sent as voice messages; if that fails the error is logged
 * and the file is sent as a document instead. Every other format is sent as a
 * document straight away. A chat action is sent first on a best-effort basis.
 *
 * @param sourceMessage Message being replied to.
 * @param speech Cached audio file to upload.
 * @returns The message Telegram reports as sent.
 * @throws Error when the file exceeds `TELEGRAM_FILE_LIMIT_BYTES`, when the document
 *         upload fails, or when storing the sent message fails.
 */
export async function sendSynthesizedSpeech(sourceMessage: Message, speech: SynthesizedSpeech): Promise<Message> {
    if (speech.sizeBytes > TELEGRAM_FILE_LIMIT_BYTES) {
        throw new Error(Environment.speechFileTooLargeText);
    }

    const chat = sourceMessage.chat;
    const caption = Environment.getTextToSpeechCaption(providerName(speech.provider), speech.model, speech.voice);
    const sendAsVoice = speech.format === "mp3" || speech.format === "opus";

    // Purely cosmetic: a failure to show the chat action must not block the upload.
    await enqueueTelegramApiCall(
        () => bot.sendChatAction({
            chat_id: chat.id,
            action: sendAsVoice ? "upload_voice" : "upload_document",
        }),
        {method: "sendChatAction", chatId: chat.id, chatType: chat.type}
    ).catch(logError);

    const uploadVoice = async () => {
        const upload = createSpeechUpload(speech);
        try {
            return await bot.sendVoice({
                chat_id: chat.id,
                voice: upload,
                caption,
                reply_parameters: {message_id: sourceMessage.message_id},
            });
        } finally {
            destroyUpload(upload);
        }
    };

    let sent: Message;
    if (!sendAsVoice) {
        sent = await sendSpeechDocument(sourceMessage, speech, caption);
    } else {
        try {
            sent = await enqueueTelegramApiCall(
                uploadVoice,
                {method: "sendVoice", chatId: chat.id, chatType: chat.type}
            );
        } catch (e) {
            logError(e);
            sent = await sendSpeechDocument(sourceMessage, speech, caption);
        }
    }

    await storeSpeechMessage(sent, sourceMessage, speech);
    return sent;
}
|
||||||
|
|
||||||
|
/**
 * Uploads the cached speech file as a document replying to `sourceMessage`.
 *
 * The upload stream is created inside the queued call and destroyed afterwards,
 * whether or not the upload succeeds.
 *
 * @param sourceMessage Message being replied to.
 * @param speech Cached audio file to upload.
 * @param caption Caption attached to the document.
 * @returns The message Telegram reports as sent.
 */
async function sendSpeechDocument(sourceMessage: Message, speech: SynthesizedSpeech, caption: string): Promise<Message> {
    const chat = sourceMessage.chat;

    const uploadDocument = async () => {
        const upload = createSpeechUpload(speech);
        try {
            return await bot.sendDocument({
                chat_id: chat.id,
                document: upload,
                caption,
                reply_parameters: {message_id: sourceMessage.message_id},
            });
        } finally {
            destroyUpload(upload);
        }
    };

    return enqueueTelegramApiCall(
        uploadDocument,
        {method: "sendDocument", chatId: chat.id, chatType: chat.type}
    );
}
|
||||||
|
|
||||||
|
/**
 * Persists the sent speech message, with its audio attachment, in the message store.
 *
 * File ids come from the sent message's voice, audio or document (first present);
 * when none is present the cache path is used as file id. Missing values on the
 * sent message fall back to the source message id, sender id 0, the file name as
 * text, and the current time.
 *
 * @param sent Message returned by Telegram.
 * @param sourceMessage Message that was replied to.
 * @param speech Cached audio file that was uploaded.
 */
async function storeSpeechMessage(sent: Message, sourceMessage: Message, speech: SynthesizedSpeech): Promise<void> {
    const media = sent.voice ?? sent.audio ?? sent.document;
    const nowSeconds = (): number => Math.floor(Date.now() / 1000);

    const attachment: StoredAttachment = {
        kind: "audio",
        fileId: media?.file_id ?? speech.path,
        fileUniqueId: media?.file_unique_id,
        fileName: speech.fileName,
        mimeType: speech.mimeType,
        cachePath: speech.path,
    };

    const record: StoredMessage = {
        chatId: sent.chat.id,
        id: sent.message_id,
        replyToMessageId: sent.reply_to_message?.message_id ?? sourceMessage.message_id,
        fromId: sent.from?.id ?? 0,
        text: sent.caption ?? speech.fileName,
        date: sent.date ?? nowSeconds(),
        attachments: [attachment],
    };

    await MessageStore.put(record);
}
|
||||||
@@ -0,0 +1,99 @@
|
|||||||
|
import {Message} from "typescript-telegram-bot-api";
|
||||||
|
import {Command} from "../base/command";
|
||||||
|
import {AiProvider} from "../model/ai-provider";
|
||||||
|
import {
|
||||||
|
isTranscribableAudioDownload,
|
||||||
|
resolveSpeechToTextProviderForUser,
|
||||||
|
transcribeSpeechDownloads,
|
||||||
|
} from "../ai/speech-to-text";
|
||||||
|
import {attachmentsToDownloadedFiles, cacheMessageAttachments} from "../ai/telegram-attachments";
|
||||||
|
import {MessageStore} from "../common/message-store";
|
||||||
|
import {StoredAttachment} from "../model/stored-attachment";
|
||||||
|
import {logError, replyToMessage} from "../util/utils";
|
||||||
|
import {Environment} from "../common/environment";
|
||||||
|
|
||||||
|
/** Maximum length of a reply text, in UTF-16 code units, before it is truncated. */
const TELEGRAM_LIMIT = 4096;

/** Lower-case command tokens that select a provider explicitly. */
const PROVIDER_ALIASES = new Map<string, AiProvider>([
    // OpenAI
    ["openai", AiProvider.OPENAI],
    ["chatgpt", AiProvider.OPENAI],
    ["gpt", AiProvider.OPENAI],
    // Gemini
    ["gemini", AiProvider.GEMINI],
    ["google", AiProvider.GEMINI],
    // Mistral
    ["mistral", AiProvider.MISTRAL],
    // Ollama
    ["ollama", AiProvider.OLLAMA],
]);
|
||||||
|
|
||||||
|
/**
 * Maps a command token to a provider.
 *
 * The token is lower-cased and a single trailing colon is dropped before the
 * alias lookup, so `OpenAI:` and `openai` are equivalent.
 *
 * @param token First argument token, if any.
 * @returns The aliased provider, or `undefined` for an empty or unknown token.
 */
function parseProviderToken(token: string | undefined): AiProvider | undefined {
    if (!token) return undefined;

    const lowered = token.toLowerCase();
    const alias = lowered.endsWith(":") ? lowered.slice(0, -1) : lowered;
    return PROVIDER_ALIASES.get(alias);
}
|
||||||
|
|
||||||
|
/**
 * Returns the attachments of a message, preferring what the message store already holds.
 *
 * @param msg Message to inspect; `undefined` yields an empty list.
 * @returns The stored attachments when there are any, otherwise the result of
 *          `cacheMessageAttachments(msg)`.
 */
async function collectStoredAttachments(msg: Message | undefined): Promise<StoredAttachment[]> {
    if (msg === undefined) {
        return [];
    }

    const record = await MessageStore.get(msg.chat.id, msg.message_id);
    const known = record?.attachments;
    if (known?.length) {
        return known;
    }

    return cacheMessageAttachments(msg);
}
|
||||||
|
|
||||||
|
/**
 * Gathers the transcribable audio downloads of a message and of the message it replies to.
 *
 * Attachments of the message itself come first. Downloads sharing the same file id
 * and path are returned only once, keeping the first occurrence.
 *
 * @param msg Command message.
 * @returns De-duplicated transcribable downloads.
 */
async function collectAudioDownloads(msg: Message) {
    const ownAttachments = await collectStoredAttachments(msg);
    const repliedAttachments = await collectStoredAttachments(msg.reply_to_message);

    const audioDownloads = attachmentsToDownloadedFiles([...ownAttachments, ...repliedAttachments])
        .filter(isTranscribableAudioDownload);

    const seenKeys = new Set<string>();
    const unique: typeof audioDownloads = [];
    for (const download of audioDownloads) {
        const key = `${download.fileId}:${download.path}`;
        if (seenKeys.has(key)) continue;

        seenKeys.add(key);
        unique.push(download);
    }

    return unique;
}
|
||||||
|
|
||||||
|
/**
 * `/stt` / `/transcribe` command: transcribes the audio attached to the command
 * message or to the message it replies to, and answers with the transcript.
 *
 * The first argument token may name a provider (see `parseProviderToken`); when it
 * does, no other provider is substituted for it.
 */
export class SpeechToText extends Command {
    command = ["stt", "transcribe"];
    argsMode = "optional" as const;

    title = Environment.commandTitles.speechToText;
    description = Environment.commandDescriptions.speechToText;

    /**
     * Runs the command.
     *
     * Replies with usage instructions when no transcribable audio is found, with the
     * transcript (or an "empty result" text) on success, and with the error message
     * when provider resolution or transcription fails. Reply failures are only logged.
     *
     * @param msg Command message; ignored when it has no sender.
     * @param match Command regex match; capture group 3 is read as the argument string.
     */
    async execute(msg: Message, match?: RegExpExecArray | null): Promise<void> {
        if (!msg.from) return;

        const args = match?.[3]?.trim() ?? "";
        const explicitProvider = parseProviderToken(args.split(/\s+/)[0]);
        const downloads = await collectAudioDownloads(msg);

        if (!downloads.length) {
            await replyToMessage({
                message: msg,
                text: Environment.speechToTextInstructionText,
            }).catch(logError);
            return;
        }

        try {
            const resolved = await resolveSpeechToTextProviderForUser(msg.from.id, explicitProvider, {
                allowFallback: !explicitProvider,
            });
            const transcript = await transcribeSpeechDownloads(resolved.provider, downloads);
            const text = transcript.trim() || Environment.speechToTextEmptyResultText;

            let replyText = text;
            if (text.length > TELEGRAM_LIMIT) {
                let cut = TELEGRAM_LIMIT - 3;
                // Never cut between the two halves of a surrogate pair: a lone high
                // surrogate is not valid text and may be rejected when sent.
                const lastKept = text.charCodeAt(cut - 1);
                if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
                replyText = text.slice(0, cut) + "...";
            }

            await replyToMessage({
                message: msg,
                text: replyText,
            }).catch(logError);
        } catch (e) {
            logError(e);
            await replyToMessage({
                message: msg,
                text: e instanceof Error ? e.message : String(e),
            }).catch(logError);
        }
    }
}
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
import {Message} from "typescript-telegram-bot-api";
|
||||||
|
import {Command} from "../base/command";
|
||||||
|
import {AiProvider} from "../model/ai-provider";
|
||||||
|
import {
|
||||||
|
resolveTextToSpeechProviderForUser,
|
||||||
|
sendSynthesizedSpeech,
|
||||||
|
synthesizeSpeech,
|
||||||
|
} from "../ai/text-to-speech";
|
||||||
|
import {logError, replyToMessage} from "../util/utils";
|
||||||
|
import {Environment} from "../common/environment";
|
||||||
|
|
||||||
|
/** Lower-case command tokens that select a provider explicitly. */
const PROVIDER_ALIASES = new Map<string, AiProvider>([
    // OpenAI
    ["openai", AiProvider.OPENAI],
    ["chatgpt", AiProvider.OPENAI],
    ["gpt", AiProvider.OPENAI],
    // Gemini
    ["gemini", AiProvider.GEMINI],
    ["google", AiProvider.GEMINI],
    // Mistral
    ["mistral", AiProvider.MISTRAL],
    // Ollama
    ["ollama", AiProvider.OLLAMA],
]);
|
||||||
|
|
||||||
|
/**
 * Maps a command token to a provider.
 *
 * The token is lower-cased and a single trailing colon is dropped before the
 * alias lookup, so `Gemini:` and `gemini` are equivalent.
 *
 * @param token First argument token, if any.
 * @returns The aliased provider, or `undefined` for an empty or unknown token.
 */
function parseProviderToken(token: string | undefined): AiProvider | undefined {
    if (!token) return undefined;

    const lowered = token.toLowerCase();
    const alias = lowered.endsWith(":") ? lowered.slice(0, -1) : lowered;
    return PROVIDER_ALIASES.get(alias);
}
|
||||||
|
|
||||||
|
/**
 * `/tts` / `/say` / `/voice` command: synthesizes speech for the given text, or for
 * the text/caption of the replied-to message, and sends it back as audio.
 *
 * The first argument token may name a provider (see `parseProviderToken`); the
 * remainder of the arguments is then the text to speak.
 */
export class TextToSpeech extends Command {
    command = ["tts", "say", "voice"];
    argsMode = "optional" as const;

    title = Environment.commandTitles.textToSpeech;
    description = Environment.commandDescriptions.textToSpeech;

    /**
     * Runs the command.
     *
     * Replies with usage instructions when there is no text to speak, and with the
     * error message when provider resolution, synthesis or sending fails. Reply
     * failures are only logged.
     *
     * @param msg Command message; ignored when it has no sender.
     * @param match Command regex match; capture group 3 is read as the argument string.
     */
    async execute(msg: Message, match?: RegExpExecArray | null): Promise<void> {
        if (!msg.from) return;

        const args = match?.[3]?.trim() ?? "";
        const replyText = (msg.reply_to_message?.text ?? msg.reply_to_message?.caption ?? "").trim();
        const [firstToken = ""] = args.split(/\s+/, 1);
        const explicitProvider = parseProviderToken(firstToken);

        // `args` is trimmed, so it starts with `firstToken`. Cutting the token off the
        // original string keeps line breaks and spacing of the text intact, exactly as
        // in the branch without a provider token.
        const inlineText = explicitProvider
            ? args.slice(firstToken.length).trim()
            : args;
        const text = inlineText || replyText;

        if (!text.trim()) {
            await replyToMessage({
                message: msg,
                text: Environment.textToSpeechInstructionText,
            }).catch(logError);
            return;
        }

        try {
            const resolved = await resolveTextToSpeechProviderForUser(msg.from.id, explicitProvider);
            const speech = await synthesizeSpeech({provider: resolved.provider, text});
            await sendSynthesizedSpeech(msg, speech);
        } catch (e) {
            logError(e);
            await replyToMessage({
                message: msg,
                text: e instanceof Error ? e.message : String(e),
            }).catch(logError);
        }
    }
}
|
||||||
Reference in New Issue
Block a user