ai: add RAG, speech-to-text and text-to-speech

This commit is contained in:
2026-05-10 22:53:07 +03:00
parent 355ae8e5da
commit 1b94760b21
6 changed files with 2223 additions and 0 deletions
+1360
View File
File diff suppressed because it is too large Load Diff
+8
View File
@@ -0,0 +1,8 @@
/** Allow-list of Ollama model tags accepted for speech-to-text, stored in lower case. */
const OLLAMA_SPEECH_TO_TEXT_MODELS = new Set(["gemma4:e2b", "gemma4:e4b"]);

/**
 * Reports whether `model` is one of the allow-listed Ollama speech-to-text models.
 *
 * The name is trimmed and lower-cased before the lookup, so surrounding whitespace
 * and letter case do not matter.
 *
 * @param model Model tag to check; may be missing.
 * @returns `true` only for a non-empty name present in the allow-list.
 */
export function isOllamaSpeechToTextModel(model: string | undefined | null): boolean {
    if (!model) {
        return false;
    }
    const normalized = model.trim().toLowerCase();
    return OLLAMA_SPEECH_TO_TEXT_MODELS.has(normalized);
}
+256
View File
@@ -0,0 +1,256 @@
import fs, {openAsBlob} from "node:fs";
import {AiProvider} from "../model/ai-provider";
import {
getAvailableAiProviderChoices,
getProviderChoiceLabel,
normalizeAiProviderChoice,
resolveEffectiveAiProviderForUser,
} from "../common/user-ai-settings";
import {AiDownloadedFile} from "./telegram-attachments";
import {isOllamaSpeechToTextModel} from "./speech-to-text-models";
import {
createGoogleGenAiClient,
createMistralClient,
createOllamaClient,
createOpenAiClient,
resolveAiRuntimeTarget
} from "./ai-runtime-target";
import {Environment} from "../common/environment";
/** Outcome of transcribing a single audio file. */
export type TranscribedSpeech = {
// Provider that produced the transcription.
provider: AiProvider;
// Model name taken from the resolved runtime target.
model: string;
// Transcribed text; the transcribers fall back to "" when the provider returns none.
text: string;
// File name of the audio that was transcribed.
fileName: string;
};
/** Input for transcribeSpeech: which provider to use and which audio to send. */
export type SpeechToTextRequest = {
provider: AiProvider;
audio: AiDownloadedFile;
// Optional abort signal; checked before dispatch and forwarded to the provider call.
signal?: AbortSignal;
};
/** Provider chosen for a user, plus whether it is a substitute for the requested one. */
export type SpeechToTextProviderResolution = {
provider: AiProvider;
// false when the preferred provider itself was returned; true when another provider was substituted.
fallback: boolean;
};
/** Options for resolveSpeechToTextProviderForUser. */
export type SpeechToTextResolveOptions = {
// Treated as true when omitted. When false, an unconfigured preferred provider is an error.
allowFallback?: boolean;
};
/** Returns the label that getProviderChoiceLabel yields for the provider; used in error texts. */
function providerName(provider: AiProvider): string {
    const label = getProviderChoiceLabel(provider);
    return label;
}
/**
 * Decides whether a downloaded attachment can be sent to a speech-to-text provider.
 *
 * - `"audio"` downloads always qualify.
 * - `"video-note"` downloads qualify only when their MIME type starts with `audio/`
 *   or their path ends in `.wav` (case-insensitive).
 * - Every other kind is rejected.
 */
export function isTranscribableAudioDownload(download: AiDownloadedFile): boolean {
    switch (download.kind) {
        case "audio":
            return true;
        case "video-note": {
            const hasAudioMime = download.mimeType?.startsWith("audio/");
            return hasAudioMime || download.path.toLowerCase().endsWith(".wav");
        }
        default:
            return false;
    }
}
/**
 * Tells whether the given provider has enough configuration to transcribe speech.
 *
 * OpenAI, Gemini and Mistral need both an API key and a model on their
 * "speechToText" runtime target. Ollama needs a base URL and a model accepted by
 * isOllamaSpeechToTextModel.
 */
export function isSpeechToTextConfigured(provider: AiProvider): boolean {
    switch (provider) {
        case AiProvider.OPENAI:
        case AiProvider.GEMINI:
        case AiProvider.MISTRAL: {
            const {apiKey, model} = resolveAiRuntimeTarget(provider, "speechToText");
            return Boolean(apiKey) && Boolean(model);
        }
        case AiProvider.OLLAMA: {
            const {baseUrl, model} = resolveAiRuntimeTarget(provider, "speechToText");
            return Boolean(baseUrl) && isOllamaSpeechToTextModel(model);
        }
    }
}
/**
 * Picks the speech-to-text provider to use for a user.
 *
 * Resolution order:
 * 1. `preferredProvider`, when given, allowed for the user and configured.
 * 2. The user's effective provider, when configured.
 * 3. The first allowed provider that is configured.
 *
 * @param userId User whose available provider choices are consulted.
 * @param preferredProvider Provider explicitly requested, if any.
 * @param options `allowFallback` (treated as `true` when omitted) controls whether an
 *        unconfigured preferred provider may be replaced by another one.
 * @returns The chosen provider and whether it is a substitute.
 * @throws Error when the preferred provider is not available to the user, when it is
 *         not configured and fallback is disallowed, or when no provider is configured.
 */
export async function resolveSpeechToTextProviderForUser(
    userId: number,
    preferredProvider?: AiProvider,
    options: SpeechToTextResolveOptions = {},
): Promise<SpeechToTextProviderResolution> {
    const fallbackPermitted = options.allowFallback ?? true;
    const allowedProviders = getAvailableAiProviderChoices(userId)
        .map(choice => normalizeAiProviderChoice(choice))
        .filter((choice): choice is AiProvider => !!choice && choice !== "DEFAULT");

    if (preferredProvider) {
        const permitted = allowedProviders.includes(preferredProvider);
        if (!permitted) {
            throw new Error(Environment.getProviderNotAvailableForAccessText(providerName(preferredProvider)));
        }
        if (isSpeechToTextConfigured(preferredProvider)) {
            return {provider: preferredProvider, fallback: false};
        }
        if (!fallbackPermitted) {
            throw new Error(Environment.getProviderSpeechToTextUnsupportedText(providerName(preferredProvider)));
        }
    }

    const effectiveProvider = await resolveEffectiveAiProviderForUser(userId);
    if (isSpeechToTextConfigured(effectiveProvider)) {
        // Only a fallback when the caller asked for a different provider.
        const substituted = preferredProvider !== undefined && preferredProvider !== effectiveProvider;
        return {provider: effectiveProvider, fallback: substituted};
    }

    const fallbackProvider = allowedProviders.find(candidate => isSpeechToTextConfigured(candidate));
    if (fallbackProvider) {
        return {provider: fallbackProvider, fallback: true};
    }
    throw new Error(Environment.noSpeechToTextProviderForAccessText);
}
/**
 * Transcribes one audio file with the provider named in the request.
 *
 * @throws Error("Aborted") when the request's signal is already aborted on entry.
 */
export async function transcribeSpeech(request: SpeechToTextRequest): Promise<TranscribedSpeech> {
    const {provider, audio, signal} = request;
    if (signal?.aborted) {
        throw new Error("Aborted");
    }
    switch (provider) {
        case AiProvider.OPENAI:
            return transcribeOpenAiSpeech(audio, signal);
        case AiProvider.GEMINI:
            return transcribeGeminiSpeech(audio, signal);
        case AiProvider.MISTRAL:
            return transcribeMistralSpeech(audio, signal);
        case AiProvider.OLLAMA:
            return transcribeOllamaSpeech(audio, signal);
    }
}
/**
 * Transcribes every transcribable download, one after another, and joins the results.
 *
 * Empty transcriptions are skipped. When more than one audio is processed, each
 * section is prefixed with its 1-based position and file name; a single audio
 * yields just its text. Sections are separated by a blank line.
 *
 * @throws Error("Aborted") when `signal` is aborted before a file is started.
 */
export async function transcribeSpeechDownloads(provider: AiProvider, downloads: AiDownloadedFile[], signal?: AbortSignal): Promise<string> {
    const audios = downloads.filter(download => isTranscribableAudioDownload(download));
    const labelSections = audios.length > 1;
    const sections: string[] = [];
    for (let position = 0; position < audios.length; position++) {
        const audio = audios[position];
        if (signal?.aborted) {
            throw new Error("Aborted");
        }
        const {text: rawText} = await transcribeSpeech({provider, audio, signal});
        const text = rawText.trim();
        if (text) {
            sections.push(labelSections ? `[${position + 1}. ${audio.fileName}]\n${text}` : text);
        }
    }
    return sections.join("\n\n").trim();
}
/**
 * Transcribes an audio file through the OpenAI transcription endpoint.
 *
 * The file is streamed from `audio.path`; the stream is destroyed whether or not
 * the request succeeds.
 */
async function transcribeOpenAiSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.OPENAI, "speechToText");
    const client = createOpenAiClient(target);
    const stream = fs.createReadStream(audio.path);
    try {
        const transcription = await client.audio.transcriptions.create(
            {file: stream, model: target.model},
            {signal},
        );
        const text = transcription.text || "";
        return {provider: AiProvider.OPENAI, model: target.model, text, fileName: audio.fileName};
    } finally {
        stream.destroy();
    }
}
/**
 * Transcribes an audio file through the Mistral transcription endpoint.
 * The file at `audio.path` is opened as a Blob and sent as the request's `file`.
 */
async function transcribeMistralSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.MISTRAL, "speechToText");
    const client = createMistralClient(target);
    const file = await openAsBlob(audio.path);
    const transcription = await client.audio.transcriptions.complete({model: target.model, file}, {signal});
    const text = transcription.text || "";
    return {provider: AiProvider.MISTRAL, model: target.model, text, fileName: audio.fileName};
}
/**
 * Transcribes audio with Gemini by sending a fixed transcription instruction and
 * the audio as base64 inline data, at temperature 0.
 *
 * The MIME type falls back to "audio/wav" when the download has none.
 */
async function transcribeGeminiSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.GEMINI, "speechToText");
    const client = createGoogleGenAiClient(target);
    const instruction = {
        text: "Transcribe the attached audio verbatim. Reply only with the transcription text. Do not answer the speaker.",
    };
    const attachment = {
        inlineData: {
            data: audio.buffer.toString("base64"),
            mimeType: audio.mimeType || "audio/wav",
        },
    };
    const response = await client.models.generateContent({
        model: target.model,
        contents: [{role: "user", parts: [instruction, attachment]}],
        config: {temperature: 0, abortSignal: signal},
    });
    const text = collectGeminiText(response);
    return {provider: AiProvider.GEMINI, model: target.model, text, fileName: audio.fileName};
}
/**
 * Transcribes audio with an Ollama chat model.
 *
 * The audio is passed base64-encoded in the message's `images` field, with a fixed
 * transcription instruction as the content.
 * NOTE(review): `signal` is only checked on entry and is not forwarded to the chat
 * call — confirm whether in-flight cancellation is needed.
 *
 * @throws Error("Aborted") when `signal` is already aborted.
 * @throws Error when the configured model is not an accepted speech-to-text model.
 */
async function transcribeOllamaSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    if (signal?.aborted) {
        throw new Error("Aborted");
    }
    const target = resolveAiRuntimeTarget(AiProvider.OLLAMA, "speechToText");
    const {model} = target;
    if (!isOllamaSpeechToTextModel(model)) {
        throw new Error(Environment.ollamaSpeechToTextModelRequiredText);
    }
    const client = createOllamaClient(target);
    const encodedAudio = audio.buffer.toString("base64");
    const reply = await client.chat({
        model,
        stream: false,
        think: false,
        messages: [{
            role: "user",
            content: "Transcribe the attached audio verbatim. Reply only with the transcription text. Do not answer the speaker.",
            images: [encodedAudio],
        }],
        options: {temperature: 0},
    });
    const text = reply?.message?.content || "";
    return {provider: AiProvider.OLLAMA, model, text, fileName: audio.fileName};
}
/**
 * Extracts plain text from a Gemini response.
 *
 * Lookup order:
 * 1. `response.text` when it is a string (even an empty one).
 * 2. The `text` of every part of every candidate, concatenated, when that is not blank.
 * 3. Otherwise, per candidate: the candidate itself when it is a string, else the
 *    text of its first part; all concatenated.
 */
function collectGeminiText(response: any): string {
    const direct = response?.text;
    if (typeof direct === "string") {
        return direct;
    }

    const candidates = response?.candidates ?? [];
    const parts = candidates.flatMap((candidate: any) => candidate?.content?.parts ?? []);
    const combined = parts.map((part: any) => part?.text ?? "").join("");
    if (combined.trim()) {
        return combined;
    }

    const firstTexts = candidates.map((output: any) => {
        if (typeof output === "string") {
            return output;
        }
        return output?.content?.parts?.[0]?.text ?? "";
    });
    return firstTexts.join("");
}
+435
View File
@@ -0,0 +1,435 @@
import fs from "node:fs";
import path from "node:path";
import {randomUUID} from "node:crypto";
import {FileOptions, Message} from "typescript-telegram-bot-api";
import {AiProvider} from "../model/ai-provider";
import {Environment} from "../common/environment";
import {bot} from "../index";
import {
getAvailableAiProviderChoices,
getProviderChoiceLabel,
normalizeAiProviderChoice,
resolveEffectiveAiProviderForUser,
} from "../common/user-ai-settings";
import {enqueueTelegramApiCall} from "../util/telegram-api-queue";
import {MessageStore} from "../common/message-store";
import {StoredAttachment} from "../model/stored-attachment";
import {StoredMessage} from "../model/stored-message";
import {logError} from "../util/utils";
import {SpeechRequest} from "@mistralai/mistralai/models/components";
import {createGoogleGenAiClient, createMistralClient, createOpenAiClient, resolveAiRuntimeTarget} from "./ai-runtime-target";
// Longest trimmed input text, in characters, accepted for synthesis.
const MAX_TTS_TEXT_CHARS = 4096;
// Largest synthesized file, in bytes (50 MiB), that will be sent to Telegram.
const TELEGRAM_FILE_LIMIT_BYTES = 50 * 1024 * 1024;
/** Audio formats a synthesized file may have; also used as the file extension. */
export type TextToSpeechFormat = "mp3" | "wav" | "flac" | "opus" | "aac" | "pcm";
/** A synthesized audio file that has been written to the cache directory. */
export type SynthesizedSpeech = {
provider: AiProvider;
model: string;
// Voice used for synthesis, when one was requested or configured.
voice?: string;
format: TextToSpeechFormat;
mimeType: string;
// Generated file name inside the cache directory.
fileName: string;
// Full path of the written file.
path: string;
// Size of the written audio in bytes.
sizeBytes: number;
};
/** Input for synthesizeSpeech. */
export type TextToSpeechRequest = {
provider: AiProvider;
text: string;
// Optional voice override; each provider falls back to its configured voice.
voice?: string;
};
/** Provider chosen for a user, plus whether it is a substitute for the effective one. */
export type TextToSpeechProviderResolution = {
provider: AiProvider;
fallback: boolean;
};
// SynthesizedSpeech without the file-related fields, plus the raw audio bytes to write.
type SpeechFileParams = Omit<SynthesizedSpeech, "fileName" | "path" | "sizeBytes"> & {
buffer: Buffer;
};
/** Directory for synthesized audio: `<DATA_PATH>/cache/audio`. */
function ttsCacheDir(): string {
    const segments = [Environment.DATA_PATH, "cache", "audio"];
    return path.join(...segments);
}
/** Returns the label that getProviderChoiceLabel yields for the provider; used in captions and errors. */
function providerName(provider: AiProvider): string {
    const label = getProviderChoiceLabel(provider);
    return label;
}
/**
 * Validates text before synthesis and returns it trimmed.
 *
 * @throws Error when the trimmed text is empty or longer than MAX_TTS_TEXT_CHARS.
 */
function assertText(text: string): string {
    const trimmed = text.trim();
    const length = trimmed.length;
    if (length === 0) {
        throw new Error(Environment.noTextToSynthesizeText);
    }
    if (length > MAX_TTS_TEXT_CHARS) {
        throw new Error(Environment.getTextToSpeechTooLongText(length, MAX_TTS_TEXT_CHARS));
    }
    return trimmed;
}
/**
 * Tells whether the given provider has enough configuration to synthesize speech.
 *
 * OpenAI, Gemini and Mistral need both an API key and a model on their
 * "textToSpeech" runtime target. Ollama is always reported as not configured.
 */
export function isTextToSpeechConfigured(provider: AiProvider): boolean {
    switch (provider) {
        case AiProvider.OPENAI:
        case AiProvider.GEMINI:
        case AiProvider.MISTRAL: {
            const {apiKey, model} = resolveAiRuntimeTarget(provider, "textToSpeech");
            return Boolean(apiKey) && Boolean(model);
        }
        case AiProvider.OLLAMA:
            return false;
    }
}
/**
 * Picks the text-to-speech provider to use for a user.
 *
 * An explicit provider is never substituted: it must be allowed for the user and
 * configured, otherwise an error is thrown. Without one, the user's effective
 * provider is used when configured, else the first allowed provider that is.
 *
 * @throws Error when the explicit provider is unavailable or unsupported, or when
 *         no allowed provider is configured.
 */
export async function resolveTextToSpeechProviderForUser(
    userId: number,
    explicitProvider?: AiProvider,
): Promise<TextToSpeechProviderResolution> {
    const allowedProviders = getAvailableAiProviderChoices(userId)
        .map(choice => normalizeAiProviderChoice(choice))
        .filter((choice): choice is AiProvider => !!choice && choice !== "DEFAULT");

    if (explicitProvider) {
        const permitted = allowedProviders.includes(explicitProvider);
        if (!permitted) {
            throw new Error(Environment.getProviderNotAvailableForAccessText(providerName(explicitProvider)));
        }
        if (isTextToSpeechConfigured(explicitProvider)) {
            return {provider: explicitProvider, fallback: false};
        }
        throw new Error(Environment.getProviderTextToSpeechUnsupportedText(providerName(explicitProvider)));
    }

    const effectiveProvider = await resolveEffectiveAiProviderForUser(userId);
    if (isTextToSpeechConfigured(effectiveProvider)) {
        return {provider: effectiveProvider, fallback: false};
    }

    const fallbackProvider = allowedProviders.find(candidate => isTextToSpeechConfigured(candidate));
    if (fallbackProvider) {
        return {provider: fallbackProvider, fallback: true};
    }
    throw new Error(Environment.noTextToSpeechProviderForAccessText);
}
/**
 * Synthesizes speech for the request's text with the requested provider.
 *
 * The text is validated first, so empty or over-long input fails before any
 * provider is contacted.
 *
 * @throws Error for invalid text, and always for the Ollama provider.
 */
export async function synthesizeSpeech(request: TextToSpeechRequest): Promise<SynthesizedSpeech> {
    const text = assertText(request.text);
    const {provider, voice} = request;
    switch (provider) {
        case AiProvider.OPENAI:
            return synthesizeOpenAiSpeech(text, voice);
        case AiProvider.GEMINI:
            return synthesizeGeminiSpeech(text, voice);
        case AiProvider.MISTRAL:
            return synthesizeMistralSpeech(text, voice);
        case AiProvider.OLLAMA:
            throw new Error(Environment.ollamaTextToSpeechUnsupportedText);
    }
}
/**
 * Synthesizes MP3 speech with OpenAI and writes it to the cache directory.
 * Uses `voice` when given, otherwise the configured OpenAI voice.
 */
async function synthesizeOpenAiSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.OPENAI, "textToSpeech");
    const client = createOpenAiClient(target);
    const selectedVoice = voice || Environment.OPENAI_TTS_VOICE;
    const response = await client.audio.speech.create({
        model: target.model,
        voice: selectedVoice,
        input: text,
        response_format: "mp3",
        instructions: Environment.OPENAI_TTS_INSTRUCTIONS,
    });
    const audioBytes = Buffer.from(await response.arrayBuffer());
    return writeSpeechFile({
        provider: AiProvider.OPENAI,
        model: target.model,
        voice: selectedVoice,
        buffer: audioBytes,
        format: "mp3",
        mimeType: "audio/mpeg",
    });
}
/**
 * Synthesizes MP3 speech with Mistral and writes it to the cache directory.
 *
 * `model` and `voiceId` are only put on the request when a value exists. The
 * response's base64 audio is read from `audioData`, falling back to `audio_data`.
 *
 * @throws Error when the response carries no non-blank audio string.
 */
async function synthesizeMistralSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.MISTRAL, "textToSpeech");
    const client = createMistralClient(target);
    const selectedVoice = voice || Environment.MISTRAL_TTS_VOICE_ID;

    const request: SpeechRequest = {input: text, responseFormat: "mp3"};
    if (target.model) {
        request.model = target.model;
    }
    if (selectedVoice) {
        request.voiceId = selectedVoice;
    }

    const response: any = await client.audio.speech.complete(request);
    const encodedAudio = response?.audioData ?? response?.audio_data;
    const hasAudio = typeof encodedAudio === "string" && encodedAudio.trim() !== "";
    if (!hasAudio) {
        throw new Error(Environment.mistralTtsNoAudioDataText);
    }

    return writeSpeechFile({
        provider: AiProvider.MISTRAL,
        model: target.model || "mistral speech",
        voice: selectedVoice,
        buffer: Buffer.from(encodedAudio, "base64"),
        format: "mp3",
        mimeType: "audio/mpeg",
    });
}
/**
 * Synthesizes speech with Gemini by requesting the AUDIO response modality, then
 * decodes the returned inline audio and writes it to the cache directory.
 *
 * @throws Error when the response contains no audio part.
 */
async function synthesizeGeminiSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.GEMINI, "textToSpeech");
    const client = createGoogleGenAiClient(target);
    const response: any = await client.models.generateContent({
        model: target.model,
        contents: text,
        config: {
            responseModalities: ["AUDIO"],
            speechConfig: {
                voiceConfig: {
                    prebuiltVoiceConfig: {voiceName: voice || Environment.GEMINI_TTS_VOICE},
                },
            },
        },
    });

    const audioPart = findGeminiAudioPart(response);
    if (!audioPart) {
        throw new Error(Environment.geminiTextToSpeechUnsupportedText);
    }

    const {buffer, format, mimeType} = decodeGeminiAudio(audioPart.data, audioPart.mimeType);
    return writeSpeechFile({
        provider: AiProvider.GEMINI,
        model: target.model,
        voice: voice || Environment.GEMINI_TTS_VOICE,
        buffer,
        format,
        mimeType,
    });
}
/**
 * Searches a response object depth-first for the first inline audio payload.
 *
 * A node matches when its `inlineData` (or `inline_data`) has a string `data` and a
 * MIME type that is either absent or starts with `audio/`. Both camelCase and
 * snake_case MIME keys are accepted.
 *
 * @param value Any value; non-objects never match.
 * @returns The base64 data and, when it is a string, its MIME type; `null` if nothing matches.
 */
function findGeminiAudioPart(value: unknown): { data: string; mimeType?: string } | null {
    if (typeof value !== "object" || value === null) {
        return null;
    }
    const node = value as Record<string, unknown>;

    const inline = node.inlineData ?? node.inline_data;
    if (typeof inline === "object" && inline !== null) {
        const payload = inline as Record<string, unknown>;
        const data = payload.data;
        const mime = payload.mimeType ?? payload.mime_type;
        if (typeof data === "string" && (!mime || String(mime).startsWith("audio/"))) {
            return {data, mimeType: typeof mime === "string" ? mime : undefined};
        }
    }

    // Arrays are objects too, so recursing into every child covers nested lists.
    for (const child of Object.values(node)) {
        const match = findGeminiAudioPart(child);
        if (match) {
            return match;
        }
    }
    return null;
}
/**
 * Decodes base64 audio from a Gemini response into bytes that can be written as a
 * playable file, and reports the matching format and MIME type.
 *
 * The format is chosen from the declared MIME type (mp3/mpeg, wav, flac, opus, aac)
 * or from a leading "RIFF" magic. Anything else is treated as headerless 16-bit
 * mono PCM and wrapped in a WAV header, using the `rate=` MIME parameter when
 * present and 24 kHz otherwise.
 *
 * When no MIME type is supplied the payload is no longer assumed to be WAV: it is
 * labelled WAV only if it actually starts with a RIFF header. Headerless PCM
 * without a MIME type was previously written as a `.wav` file with no header.
 * NOTE(review): the 24 kHz / mono defaults presumably mirror Gemini's documented
 * TTS output — confirm against the current API documentation.
 *
 * @param data Base64-encoded audio bytes.
 * @param mimeType MIME type declared by the response, if any.
 * @returns Audio bytes (wrapped when needed), the file format and the MIME type to store.
 */
function decodeGeminiAudio(data: string, mimeType?: string): {
    buffer: Buffer;
    format: TextToSpeechFormat;
    mimeType: string;
} {
    const declaredMime = mimeType ?? "";
    const normalizedMime = declaredMime.toLowerCase();
    const raw = Buffer.from(data, "base64");
    const hasRiffHeader = raw.subarray(0, 4).toString("ascii") === "RIFF";

    if (normalizedMime.includes("mpeg") || normalizedMime.includes("mp3")) {
        return {buffer: raw, format: "mp3", mimeType: "audio/mpeg"};
    }
    if (normalizedMime.includes("wav") || hasRiffHeader) {
        return {buffer: raw, format: "wav", mimeType: "audio/wav"};
    }
    if (normalizedMime.includes("flac")) {
        return {buffer: raw, format: "flac", mimeType: "audio/flac"};
    }
    if (normalizedMime.includes("opus")) {
        return {buffer: raw, format: "opus", mimeType: "audio/opus"};
    }
    if (normalizedMime.includes("aac")) {
        return {buffer: raw, format: "aac", mimeType: "audio/aac"};
    }

    // Unknown or missing MIME type without a container header: treat as raw PCM.
    const sampleRate = Number(/rate=(\d+)/i.exec(declaredMime)?.[1]) || 24_000;
    return {
        buffer: wrapPcm16InWav(raw, sampleRate, 1),
        format: "wav",
        mimeType: "audio/wav",
    };
}

/**
 * Prepends a 44-byte RIFF/WAVE header describing uncompressed 16-bit PCM.
 *
 * @param pcm Raw little-endian 16-bit samples.
 * @param sampleRate Samples per second, per channel.
 * @param channels Number of interleaved channels.
 * @returns A new buffer holding the header followed by `pcm`.
 */
function wrapPcm16InWav(pcm: Buffer, sampleRate: number, channels: number): Buffer {
    const bitsPerSample = 16;
    const blockAlign = channels * bitsPerSample / 8;
    const byteRate = sampleRate * blockAlign;

    const header = Buffer.alloc(44);
    header.write("RIFF", 0);
    header.writeUInt32LE(36 + pcm.length, 4); // RIFF chunk size: everything after this field
    header.write("WAVE", 8);
    header.write("fmt ", 12);
    header.writeUInt32LE(16, 16); // size of the fmt chunk for PCM
    header.writeUInt16LE(1, 20); // audio format 1 = uncompressed PCM
    header.writeUInt16LE(channels, 22);
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write("data", 36);
    header.writeUInt32LE(pcm.length, 40);
    return Buffer.concat([header, pcm]);
}
/**
 * Writes synthesized audio into the cache directory (created if missing) and
 * returns its metadata.
 *
 * The file name combines the lower-cased provider, the current timestamp, a random
 * UUID and the format as extension.
 */
function writeSpeechFile(params: SpeechFileParams): SynthesizedSpeech {
    const {provider, model, voice, format, mimeType, buffer} = params;
    fs.mkdirSync(ttsCacheDir(), {recursive: true});

    const fileName = `${provider.toLowerCase()}-tts-${Date.now()}-${randomUUID()}.${format}`;
    const destination = path.join(ttsCacheDir(), fileName);
    fs.writeFileSync(destination, buffer);

    return {
        provider,
        model,
        voice,
        format,
        mimeType,
        fileName,
        path: destination,
        sizeBytes: buffer.length,
    };
}
/** Wraps a read stream of the synthesized file in FileOptions carrying its name and content type. */
function createSpeechUpload(speech: SynthesizedSpeech): FileOptions {
    const {path: sourcePath, fileName, mimeType} = speech;
    const stream = fs.createReadStream(sourcePath);
    return new FileOptions(stream, {filename: fileName, contentType: mimeType});
}
/** Calls `destroy()` on the upload's underlying file object when it has such a method. */
function destroyUpload(upload: FileOptions): void {
    const source = upload.file;
    if (!("destroy" in source) || typeof source.destroy !== "function") {
        return;
    }
    source.destroy();
}
/**
 * Sends a synthesized audio file as a reply to `sourceMessage` and stores the sent message.
 *
 * MP3 and Opus files are first sent as a voice message; if that fails the error is
 * logged and the file is sent as a document. All other formats go out as a
 * document directly. A chat action is sent beforehand on a best-effort basis.
 *
 * @throws Error when the file exceeds TELEGRAM_FILE_LIMIT_BYTES, or when the
 *         document upload or the store operation fails.
 */
export async function sendSynthesizedSpeech(sourceMessage: Message, speech: SynthesizedSpeech): Promise<Message> {
    if (speech.sizeBytes > TELEGRAM_FILE_LIMIT_BYTES) {
        throw new Error(Environment.speechFileTooLargeText);
    }

    const {id: chatId, type: chatType} = sourceMessage.chat;
    const voiceCompatible = speech.format === "mp3" || speech.format === "opus";
    const caption = Environment.getTextToSpeechCaption(providerName(speech.provider), speech.model, speech.voice);

    // Failure of the progress indicator must not block the upload.
    await enqueueTelegramApiCall(
        () => bot.sendChatAction({
            chat_id: chatId,
            action: voiceCompatible ? "upload_voice" : "upload_document",
        }),
        {method: "sendChatAction", chatId, chatType}
    ).catch(logError);

    const sendAsVoice = () => enqueueTelegramApiCall(
        async () => {
            const upload = createSpeechUpload(speech);
            try {
                return await bot.sendVoice({
                    chat_id: chatId,
                    voice: upload,
                    caption,
                    reply_parameters: {message_id: sourceMessage.message_id},
                });
            } finally {
                destroyUpload(upload);
            }
        },
        {method: "sendVoice", chatId, chatType}
    );

    let sent: Message;
    if (!voiceCompatible) {
        sent = await sendSpeechDocument(sourceMessage, speech, caption);
    } else {
        try {
            sent = await sendAsVoice();
        } catch (e) {
            logError(e);
            sent = await sendSpeechDocument(sourceMessage, speech, caption);
        }
    }

    await storeSpeechMessage(sent, sourceMessage, speech);
    return sent;
}
/**
 * Sends the synthesized file as a document replying to `sourceMessage`.
 * The upload stream is created inside the queued call and destroyed once it settles.
 */
async function sendSpeechDocument(sourceMessage: Message, speech: SynthesizedSpeech, caption: string): Promise<Message> {
    const {id: chatId, type: chatType} = sourceMessage.chat;

    const uploadDocument = async () => {
        const upload = createSpeechUpload(speech);
        try {
            return await bot.sendDocument({
                chat_id: chatId,
                document: upload,
                caption,
                reply_parameters: {message_id: sourceMessage.message_id},
            });
        } finally {
            destroyUpload(upload);
        }
    };

    return enqueueTelegramApiCall(uploadDocument, {method: "sendDocument", chatId, chatType});
}
/**
 * Records the sent speech message in the message store with one audio attachment.
 *
 * The attachment's file id comes from the sent voice, audio or document (in that
 * order) and falls back to the local cache path. Missing reply, sender and date
 * values fall back to the source message id, 0 and the current time in seconds.
 */
async function storeSpeechMessage(sent: Message, sourceMessage: Message, speech: SynthesizedSpeech): Promise<void> {
    const uploaded = sent.voice ?? sent.audio ?? sent.document;
    const nowSeconds = Math.floor(Date.now() / 1000);

    const stored: StoredMessage = {
        chatId: sent.chat.id,
        id: sent.message_id,
        replyToMessageId: sent.reply_to_message?.message_id ?? sourceMessage.message_id,
        fromId: sent.from?.id ?? 0,
        text: sent.caption ?? speech.fileName,
        date: sent.date ?? nowSeconds,
        attachments: [{
            kind: "audio",
            fileId: uploaded?.file_id ?? speech.path,
            fileUniqueId: uploaded?.file_unique_id,
            fileName: speech.fileName,
            mimeType: speech.mimeType,
            cachePath: speech.path,
        }],
    };
    await MessageStore.put(stored);
}
+99
View File
@@ -0,0 +1,99 @@
import {Message} from "typescript-telegram-bot-api";
import {Command} from "../base/command";
import {AiProvider} from "../model/ai-provider";
import {
isTranscribableAudioDownload,
resolveSpeechToTextProviderForUser,
transcribeSpeechDownloads,
} from "../ai/speech-to-text";
import {attachmentsToDownloadedFiles, cacheMessageAttachments} from "../ai/telegram-attachments";
import {MessageStore} from "../common/message-store";
import {StoredAttachment} from "../model/stored-attachment";
import {logError, replyToMessage} from "../util/utils";
import {Environment} from "../common/environment";
// Longest reply text sent back; longer transcripts are cut and end with "...".
const TELEGRAM_LIMIT = 4096;
// Lower-case command tokens a user may type to pick a provider explicitly.
const PROVIDER_ALIASES = new Map<string, AiProvider>([
["openai", AiProvider.OPENAI],
["chatgpt", AiProvider.OPENAI],
["gpt", AiProvider.OPENAI],
["gemini", AiProvider.GEMINI],
["google", AiProvider.GEMINI],
["mistral", AiProvider.MISTRAL],
["ollama", AiProvider.OLLAMA],
]);
/**
 * Maps a command token to a provider via PROVIDER_ALIASES.
 * Matching is case-insensitive and ignores one trailing colon ("gpt:" equals "gpt").
 *
 * @returns The provider, or `undefined` for a missing, empty or unknown token.
 */
function parseProviderToken(token: string | undefined): AiProvider | undefined {
    if (!token) {
        return undefined;
    }
    const alias = token.toLowerCase().replace(/:$/, "");
    return PROVIDER_ALIASES.get(alias);
}
/**
 * Returns the attachments of a message: those already in the message store when
 * there are any, otherwise whatever cacheMessageAttachments yields for the message.
 * A missing message yields an empty list.
 */
async function collectStoredAttachments(msg: Message | undefined): Promise<StoredAttachment[]> {
    if (!msg) {
        return [];
    }
    const stored = await MessageStore.get(msg.chat.id, msg.message_id);
    const known = stored?.attachments;
    if (known?.length) {
        return known;
    }
    return cacheMessageAttachments(msg);
}
/**
 * Gathers the transcribable audio downloads of a message and of the message it
 * replies to, in that order, dropping duplicates that share file id and path.
 */
async function collectAudioDownloads(msg: Message) {
    const ownAttachments = await collectStoredAttachments(msg);
    const repliedAttachments = await collectStoredAttachments(msg.reply_to_message);

    const candidates = attachmentsToDownloadedFiles([...ownAttachments, ...repliedAttachments])
        .filter(download => isTranscribableAudioDownload(download));

    const seenKeys = new Set<string>();
    const unique: typeof candidates = [];
    for (const download of candidates) {
        const key = `${download.fileId}:${download.path}`;
        if (seenKeys.has(key)) {
            continue;
        }
        seenKeys.add(key);
        unique.push(download);
    }
    return unique;
}
/**
 * `/stt` and `/transcribe` command: transcribes audio attached to the command
 * message or to the message it replies to, and answers with the transcript.
 */
export class SpeechToText extends Command {
    command = ["stt", "transcribe"];
    argsMode = "optional" as const;
    title = Environment.commandTitles.speechToText;
    description = Environment.commandDescriptions.speechToText;

    /**
     * Runs the command.
     *
     * The first argument may name a provider; when it does, no other provider is
     * substituted. Without audio the usage instruction is sent. Transcripts longer
     * than TELEGRAM_LIMIT are cut and suffixed with "...". Errors are logged and
     * their message is sent back to the chat.
     */
    async execute(msg: Message, match?: RegExpExecArray | null): Promise<void> {
        const sender = msg.from;
        if (!sender) {
            return;
        }

        const [providerToken] = (match?.[3]?.trim() ?? "").split(/\s+/);
        const explicitProvider = parseProviderToken(providerToken);

        const downloads = await collectAudioDownloads(msg);
        if (downloads.length === 0) {
            await replyToMessage({
                message: msg,
                text: Environment.speechToTextInstructionText,
            }).catch(logError);
            return;
        }

        try {
            const {provider} = await resolveSpeechToTextProviderForUser(sender.id, explicitProvider, {
                allowFallback: !explicitProvider,
            });
            const transcript = await transcribeSpeechDownloads(provider, downloads);
            const text = transcript.trim() || Environment.speechToTextEmptyResultText;
            const reply = text.length > TELEGRAM_LIMIT
                ? text.slice(0, TELEGRAM_LIMIT - 3) + "..."
                : text;
            await replyToMessage({message: msg, text: reply}).catch(logError);
        } catch (e) {
            logError(e);
            const reason = e instanceof Error ? e.message : String(e);
            await replyToMessage({message: msg, text: reason}).catch(logError);
        }
    }
}
+65
View File
@@ -0,0 +1,65 @@
import {Message} from "typescript-telegram-bot-api";
import {Command} from "../base/command";
import {AiProvider} from "../model/ai-provider";
import {
resolveTextToSpeechProviderForUser,
sendSynthesizedSpeech,
synthesizeSpeech,
} from "../ai/text-to-speech";
import {logError, replyToMessage} from "../util/utils";
import {Environment} from "../common/environment";
// Lower-case command tokens a user may type to pick a provider explicitly.
const PROVIDER_ALIASES = new Map<string, AiProvider>([
["openai", AiProvider.OPENAI],
["chatgpt", AiProvider.OPENAI],
["gpt", AiProvider.OPENAI],
["gemini", AiProvider.GEMINI],
["google", AiProvider.GEMINI],
["mistral", AiProvider.MISTRAL],
["ollama", AiProvider.OLLAMA],
]);
/**
 * Maps a command token to a provider via PROVIDER_ALIASES.
 * Matching is case-insensitive and ignores one trailing colon ("gpt:" equals "gpt").
 *
 * @returns The provider, or `undefined` for a missing, empty or unknown token.
 */
function parseProviderToken(token: string | undefined): AiProvider | undefined {
    if (!token) {
        return undefined;
    }
    const alias = token.toLowerCase().replace(/:$/, "");
    return PROVIDER_ALIASES.get(alias);
}
/**
 * `/tts`, `/say` and `/voice` command: synthesizes speech from the command text or,
 * when none is given, from the text or caption of the replied-to message.
 */
export class TextToSpeech extends Command {
    command = ["tts", "say", "voice"];
    argsMode = "optional" as const;
    title = Environment.commandTitles.textToSpeech;
    description = Environment.commandDescriptions.textToSpeech;

    /**
     * Runs the command.
     *
     * When the first argument names a provider, it is removed from the text and that
     * provider is used. The remaining text keeps its original line breaks and spacing,
     * exactly as in the no-provider case. Without any text the usage instruction is
     * sent. Errors are logged and their message is sent back to the chat.
     */
    async execute(msg: Message, match?: RegExpExecArray | null): Promise<void> {
        if (!msg.from) return;

        const args = match?.[3]?.trim() ?? "";
        const replyText = (msg.reply_to_message?.text ?? msg.reply_to_message?.caption ?? "").trim();

        const [firstToken = ""] = args.split(/\s+/);
        const explicitProvider = parseProviderToken(firstToken);
        // `args` is trimmed, so it starts with `firstToken`; slicing the token off
        // (instead of re-joining split tokens) preserves newlines and repeated spaces.
        const ownText = explicitProvider ? args.slice(firstToken.length).trim() : args;
        const text = ownText || replyText;

        if (!text.trim()) {
            await replyToMessage({
                message: msg,
                text: Environment.textToSpeechInstructionText,
            }).catch(logError);
            return;
        }

        try {
            const resolved = await resolveTextToSpeechProviderForUser(msg.from.id, explicitProvider);
            const speech = await synthesizeSpeech({provider: resolved.provider, text});
            await sendSynthesizedSpeech(msg, speech);
        } catch (e) {
            logError(e);
            await replyToMessage({
                message: msg,
                text: e instanceof Error ? e.message : String(e),
            }).catch(logError);
        }
    }
}