ai: add RAG, speech-to-text and text-to-speech
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,8 @@
|
||||
/**
 * Ollama model tags accepted for speech-to-text.
 * Entries are lower-case because lookups lower-case their input before matching.
 */
const OLLAMA_SPEECH_TO_TEXT_MODELS: ReadonlySet<string> = new Set([
    "gemma4:e2b",
    "gemma4:e4b",
]);

/**
 * Reports whether an Ollama model tag is on the speech-to-text allow-list.
 *
 * @param model Model tag; surrounding whitespace and letter case are ignored.
 * @returns `true` only for a non-empty tag found in the allow-list; `false` for
 *          `null`, `undefined`, an empty string, or an unknown tag.
 */
export function isOllamaSpeechToTextModel(model: string | undefined | null): boolean {
    if (!model) return false;
    return OLLAMA_SPEECH_TO_TEXT_MODELS.has(model.trim().toLowerCase());
}
|
||||
@@ -0,0 +1,256 @@
|
||||
import fs, {openAsBlob} from "node:fs";
|
||||
import {AiProvider} from "../model/ai-provider";
|
||||
import {
|
||||
getAvailableAiProviderChoices,
|
||||
getProviderChoiceLabel,
|
||||
normalizeAiProviderChoice,
|
||||
resolveEffectiveAiProviderForUser,
|
||||
} from "../common/user-ai-settings";
|
||||
import {AiDownloadedFile} from "./telegram-attachments";
|
||||
import {isOllamaSpeechToTextModel} from "./speech-to-text-models";
|
||||
import {
|
||||
createGoogleGenAiClient,
|
||||
createMistralClient,
|
||||
createOllamaClient,
|
||||
createOpenAiClient,
|
||||
resolveAiRuntimeTarget
|
||||
} from "./ai-runtime-target";
|
||||
import {Environment} from "../common/environment";
|
||||
|
||||
/** Result of transcribing one audio file. */
export type TranscribedSpeech = {
    /** Provider that produced the transcription. */
    provider: AiProvider;
    /** Model identifier the request was sent with. */
    model: string;
    /** Transcribed text (the transcribers in this file store `""` when the provider returned none). */
    text: string;
    /** File name of the transcribed audio. */
    fileName: string;
};

/** Input for a single transcription call. */
export type SpeechToTextRequest = {
    /** Provider to transcribe with. */
    provider: AiProvider;
    /** Downloaded audio file to transcribe. */
    audio: AiDownloadedFile;
    /** Optional cancellation signal. */
    signal?: AbortSignal;
};

/** Outcome of choosing a speech-to-text provider for a user. */
export type SpeechToTextProviderResolution = {
    /** Provider that should be used. */
    provider: AiProvider;
    /** `true` when the chosen provider is a substitute rather than the preferred/effective one. */
    fallback: boolean;
};

/** Options for provider resolution. */
export type SpeechToTextResolveOptions = {
    /** When `false`, an unusable preferred provider is an error instead of triggering a fallback. Defaults to `true`. */
    allowFallback?: boolean;
};
|
||||
|
||||
/** Returns the label `getProviderChoiceLabel` yields for `provider`; used in error texts. */
function providerName(provider: AiProvider): string {
    return getProviderChoiceLabel(provider);
}
|
||||
|
||||
/**
 * Decides whether a downloaded attachment can be sent to a transcription provider.
 *
 * Downloads of kind `"audio"` always qualify. Downloads of kind `"video-note"` qualify only
 * when their MIME type starts with `audio/` or their path ends in `.wav` (case-insensitive).
 *
 * @param download Downloaded attachment to inspect.
 * @returns `true` when the download should be transcribed.
 */
export function isTranscribableAudioDownload(download: AiDownloadedFile): boolean {
    if (download.kind === "audio") return true;
    if (download.kind !== "video-note") return false;

    const hasAudioMime = download.mimeType?.startsWith("audio/") ?? false;
    return hasAudioMime || download.path.toLowerCase().endsWith(".wav");
}
|
||||
|
||||
/**
 * Reports whether `provider` has what it needs to run speech-to-text.
 *
 * OpenAI, Gemini and Mistral need both an API key and a model on their `"speechToText"`
 * runtime target. Ollama needs a base URL and a model accepted by `isOllamaSpeechToTextModel`.
 *
 * @param provider Provider to check.
 * @returns `true` when the provider's speech-to-text target is usable.
 */
export function isSpeechToTextConfigured(provider: AiProvider): boolean {
    // Each case has its own block so the `const` bindings do not share one switch-wide scope.
    switch (provider) {
        case AiProvider.OPENAI: {
            const target = resolveAiRuntimeTarget(provider, "speechToText");
            return !!target.apiKey && !!target.model;
        }
        case AiProvider.GEMINI: {
            const target = resolveAiRuntimeTarget(provider, "speechToText");
            return !!target.apiKey && !!target.model;
        }
        case AiProvider.MISTRAL: {
            const target = resolveAiRuntimeTarget(provider, "speechToText");
            return !!target.apiKey && !!target.model;
        }
        case AiProvider.OLLAMA: {
            const target = resolveAiRuntimeTarget(provider, "speechToText");
            return !!target.baseUrl && isOllamaSpeechToTextModel(target.model);
        }
    }
}
|
||||
|
||||
/**
 * Picks the speech-to-text provider to use for a user.
 *
 * Resolution order:
 * 1. `preferredProvider`, if given, allowed for the user and configured.
 * 2. The user's effective provider (from `resolveEffectiveAiProviderForUser`), if configured.
 * 3. The first allowed provider that is configured.
 *
 * @param userId User the choice is made for.
 * @param preferredProvider Provider explicitly requested, if any.
 * @param options `allowFallback` (default `true`) controls whether an unconfigured preferred
 *                provider may be replaced by another one.
 * @returns The chosen provider and whether it is a fallback.
 * @throws Error when the preferred provider is not among the user's allowed providers, when it
 *         is not configured and fallback is disallowed, or when no allowed provider is configured.
 */
export async function resolveSpeechToTextProviderForUser(
    userId: number,
    preferredProvider?: AiProvider,
    options: SpeechToTextResolveOptions = {},
): Promise<SpeechToTextProviderResolution> {
    const allowFallback = options.allowFallback ?? true;

    // Normalise the user's choices and drop unrecognised entries and the "DEFAULT" placeholder.
    const allowedProviders = getAvailableAiProviderChoices(userId)
        .map(choice => normalizeAiProviderChoice(choice))
        .filter((choice): choice is AiProvider => !!choice && choice !== "DEFAULT");

    if (preferredProvider) {
        if (!allowedProviders.includes(preferredProvider)) {
            throw new Error(Environment.getProviderNotAvailableForAccessText(providerName(preferredProvider)));
        }

        if (isSpeechToTextConfigured(preferredProvider)) {
            return {provider: preferredProvider, fallback: false};
        }

        if (!allowFallback) {
            throw new Error(Environment.getProviderSpeechToTextUnsupportedText(providerName(preferredProvider)));
        }
    }

    const effectiveProvider = await resolveEffectiveAiProviderForUser(userId);
    if (isSpeechToTextConfigured(effectiveProvider)) {
        return {
            provider: effectiveProvider,
            // Only a fallback if the caller asked for a different provider.
            fallback: preferredProvider !== undefined && preferredProvider !== effectiveProvider,
        };
    }

    const fallbackProvider = allowedProviders.find(isSpeechToTextConfigured);
    if (!fallbackProvider) {
        throw new Error(Environment.noSpeechToTextProviderForAccessText);
    }

    return {provider: fallbackProvider, fallback: true};
}
|
||||
|
||||
/**
 * Transcribes one audio file with the requested provider.
 *
 * @param request Provider, audio file and optional abort signal.
 * @returns The transcription produced by the provider-specific implementation.
 * @throws Error with message `"Aborted"` when the signal is already aborted on entry;
 *         provider errors propagate unchanged.
 */
export async function transcribeSpeech(request: SpeechToTextRequest): Promise<TranscribedSpeech> {
    const {provider, audio, signal} = request;
    if (signal?.aborted) throw new Error("Aborted");

    switch (provider) {
        case AiProvider.OPENAI:
            return transcribeOpenAiSpeech(audio, signal);
        case AiProvider.GEMINI:
            return transcribeGeminiSpeech(audio, signal);
        case AiProvider.MISTRAL:
            return transcribeMistralSpeech(audio, signal);
        case AiProvider.OLLAMA:
            return transcribeOllamaSpeech(audio, signal);
    }
}
|
||||
|
||||
/**
 * Transcribes every transcribable download, one after another, and joins the results.
 *
 * When more than one audio file is present each transcript is prefixed with
 * `[<position>. <fileName>]`; the position counts all audio files, including those whose
 * transcript turned out empty and was skipped.
 *
 * @param provider Provider to transcribe with.
 * @param downloads Downloaded attachments; non-audio entries are ignored.
 * @param signal Optional abort signal, checked before each file and passed to the transcriber.
 * @returns Transcripts separated by a blank line, or `""` when nothing was transcribed.
 * @throws Error with message `"Aborted"` when the signal is aborted between files.
 */
export async function transcribeSpeechDownloads(provider: AiProvider, downloads: AiDownloadedFile[], signal?: AbortSignal): Promise<string> {
    const audios = downloads.filter(isTranscribableAudioDownload);
    const labelEntries = audios.length > 1;
    const transcriptions: string[] = [];

    for (const [index, audio] of audios.entries()) {
        if (signal?.aborted) throw new Error("Aborted");

        const result = await transcribeSpeech({provider, audio, signal});
        const text = result.text.trim();
        if (!text) continue;

        transcriptions.push(labelEntries ? `[${index + 1}. ${audio.fileName}]\n${text}` : text);
    }

    return transcriptions.join("\n\n").trim();
}
|
||||
|
||||
/**
 * Transcribes an audio file through the OpenAI transcription endpoint.
 *
 * @param audio Downloaded audio; the file at `audio.path` is streamed to the API.
 * @param signal Optional abort signal passed to the request.
 * @returns Transcription with `text` set to `""` when the API returned none.
 */
async function transcribeOpenAiSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.OPENAI, "speechToText");
    const openAi = createOpenAiClient(target);
    const file = fs.createReadStream(audio.path);

    try {
        const result = await openAi.audio.transcriptions.create({
            file,
            model: target.model,
        }, {signal});

        return {
            provider: AiProvider.OPENAI,
            model: target.model,
            text: result.text || "",
            fileName: audio.fileName,
        };
    } finally {
        // Release the file handle whether the request succeeded, failed or was aborted.
        file.destroy();
    }
}
|
||||
|
||||
/**
 * Transcribes an audio file through the Mistral transcription API.
 *
 * @param audio Downloaded audio; the file at `audio.path` is opened as a `Blob`.
 * @param signal Optional abort signal passed in the request options.
 * @returns Transcription with `text` set to `""` when the API returned none.
 */
async function transcribeMistralSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.MISTRAL, "speechToText");
    const mistralAi = createMistralClient(target);

    const result = await mistralAi.audio.transcriptions.complete({
        model: target.model,
        file: await openAsBlob(audio.path),
    }, {signal});

    return {
        provider: AiProvider.MISTRAL,
        model: target.model,
        text: result.text || "",
        fileName: audio.fileName,
    };
}
|
||||
|
||||
/**
 * Transcribes audio with Gemini by sending it inline (base64) next to a transcription prompt.
 *
 * @param audio Downloaded audio; `audio.buffer` is sent, with `audio/wav` assumed when no
 *              MIME type is known.
 * @param signal Optional abort signal passed as `config.abortSignal`.
 * @returns Transcription with the text extracted by `collectGeminiText`.
 */
async function transcribeGeminiSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.GEMINI, "speechToText");
    const geminiAi = createGoogleGenAiClient(target);

    const response = await geminiAi.models.generateContent({
        model: target.model,
        contents: [{
            role: "user",
            parts: [
                {text: "Transcribe the attached audio verbatim. Reply only with the transcription text. Do not answer the speaker."},
                {
                    inlineData: {
                        data: audio.buffer.toString("base64"),
                        mimeType: audio.mimeType || "audio/wav",
                    },
                },
            ],
        }],
        config: {
            temperature: 0,
            abortSignal: signal,
        },
    });

    return {
        provider: AiProvider.GEMINI,
        model: target.model,
        text: collectGeminiText(response),
        fileName: audio.fileName,
    };
}
|
||||
|
||||
/**
 * Transcribes audio with an Ollama chat model from the speech-to-text allow-list.
 *
 * The audio is base64-encoded and attached through the chat message's `images` field.
 * NOTE(review): `signal` is only checked before the request starts; it is not passed to
 * `ollama.chat`, so an in-flight request is not cancelled.
 *
 * @param audio Downloaded audio; `audio.buffer` is sent.
 * @param signal Optional abort signal, checked once on entry.
 * @returns Transcription with `text` set to `""` when the response has no message content.
 * @throws Error with message `"Aborted"` when the signal is already aborted, or the
 *         `ollamaSpeechToTextModelRequiredText` error when the configured model is not allowed.
 */
async function transcribeOllamaSpeech(audio: AiDownloadedFile, signal?: AbortSignal): Promise<TranscribedSpeech> {
    if (signal?.aborted) throw new Error("Aborted");

    const target = resolveAiRuntimeTarget(AiProvider.OLLAMA, "speechToText");
    const model = target.model;
    if (!isOllamaSpeechToTextModel(model)) {
        throw new Error(Environment.ollamaSpeechToTextModelRequiredText);
    }

    const ollama = createOllamaClient(target);
    const response = await ollama.chat({
        model,
        stream: false,
        think: false,
        messages: [{
            role: "user",
            content: "Transcribe the attached audio verbatim. Reply only with the transcription text. Do not answer the speaker.",
            images: [audio.buffer.toString("base64")],
        }],
        options: {
            temperature: 0,
        },
    });

    return {
        provider: AiProvider.OLLAMA,
        model,
        text: response?.message?.content || "",
        fileName: audio.fileName,
    };
}
|
||||
|
||||
/**
 * Extracts plain text from a Gemini `generateContent` response.
 *
 * Tries, in order:
 * 1. `response.text` when it is a string;
 * 2. the concatenated `text` of every part of every candidate, if not blank;
 * 3. per candidate, the candidate itself when it is a string, otherwise the text of its
 *    first part.
 *
 * @param response Response object; deliberately loosely typed because several shapes are accepted.
 * @returns The extracted text, or `""` when none is found (including a missing or non-array
 *          `candidates` field).
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- accepts several response shapes
function collectGeminiText(response: any): string {
    if (typeof response?.text === "string") return response.text;

    // Guard against a non-array `candidates`, which would make `.flatMap` throw.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const candidates: any[] = Array.isArray(response?.candidates) ? response.candidates : [];

    const partsText = candidates
        .flatMap(candidate => candidate?.content?.parts ?? [])
        .map(part => part?.text ?? "")
        .join("");
    if (partsText.trim()) return partsText;

    return candidates
        .map(candidate => typeof candidate === "string" ? candidate : candidate?.content?.parts?.[0]?.text ?? "")
        .join("");
}
|
||||
@@ -0,0 +1,435 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import {randomUUID} from "node:crypto";
|
||||
import {FileOptions, Message} from "typescript-telegram-bot-api";
|
||||
import {AiProvider} from "../model/ai-provider";
|
||||
import {Environment} from "../common/environment";
|
||||
import {bot} from "../index";
|
||||
import {
|
||||
getAvailableAiProviderChoices,
|
||||
getProviderChoiceLabel,
|
||||
normalizeAiProviderChoice,
|
||||
resolveEffectiveAiProviderForUser,
|
||||
} from "../common/user-ai-settings";
|
||||
import {enqueueTelegramApiCall} from "../util/telegram-api-queue";
|
||||
import {MessageStore} from "../common/message-store";
|
||||
import {StoredAttachment} from "../model/stored-attachment";
|
||||
import {StoredMessage} from "../model/stored-message";
|
||||
import {logError} from "../util/utils";
|
||||
import {SpeechRequest} from "@mistralai/mistralai/models/components";
|
||||
import {createGoogleGenAiClient, createMistralClient, createOpenAiClient, resolveAiRuntimeTarget} from "./ai-runtime-target";
|
||||
|
||||
/** Maximum length, in characters, of the trimmed text accepted for synthesis. */
const MAX_TTS_TEXT_CHARS = 4096;
/** Size cap (50 MiB) checked before a synthesized file is uploaded to Telegram. */
const TELEGRAM_FILE_LIMIT_BYTES = 50 * 1024 * 1024;
|
||||
|
||||
/** Audio container/codec identifiers; also used as the file extension of cached speech files. */
export type TextToSpeechFormat = "mp3" | "wav" | "flac" | "opus" | "aac" | "pcm";

/** A synthesized speech file written to the local cache. */
export type SynthesizedSpeech = {
    /** Provider that synthesized the audio. */
    provider: AiProvider;
    /** Model identifier used for synthesis. */
    model: string;
    /** Voice used, when one was selected. */
    voice?: string;
    /** Audio format of the file. */
    format: TextToSpeechFormat;
    /** MIME type matching `format`. */
    mimeType: string;
    /** File name inside the cache directory. */
    fileName: string;
    /** Full path of the cached file. */
    path: string;
    /** Size of the audio data in bytes. */
    sizeBytes: number;
};

/** Input for a synthesis call. */
export type TextToSpeechRequest = {
    /** Provider to synthesize with. */
    provider: AiProvider;
    /** Text to speak. */
    text: string;
    /** Optional voice override. */
    voice?: string;
};

/** Outcome of choosing a text-to-speech provider for a user. */
export type TextToSpeechProviderResolution = {
    /** Provider that should be used. */
    provider: AiProvider;
    /** `true` when the provider is a substitute for the user's effective provider. */
    fallback: boolean;
};

/** Everything needed to write a speech file: the metadata plus the raw audio bytes. */
type SpeechFileParams = Omit<SynthesizedSpeech, "fileName" | "path" | "sizeBytes"> & {
    buffer: Buffer;
};
|
||||
|
||||
/** Returns the directory for synthesized audio: `<DATA_PATH>/cache/audio`. */
function ttsCacheDir(): string {
    return path.join(Environment.DATA_PATH, "cache", "audio");
}
|
||||
|
||||
/** Returns the label `getProviderChoiceLabel` yields for `provider`; used in captions and error texts. */
function providerName(provider: AiProvider): string {
    return getProviderChoiceLabel(provider);
}
|
||||
|
||||
/**
 * Validates text before synthesis.
 *
 * @param text Raw input text.
 * @returns The trimmed text.
 * @throws Error when the trimmed text is empty or longer than `MAX_TTS_TEXT_CHARS`.
 */
function assertText(text: string): string {
    const normalized = text.trim();

    if (!normalized) {
        throw new Error(Environment.noTextToSynthesizeText);
    }
    if (normalized.length > MAX_TTS_TEXT_CHARS) {
        throw new Error(Environment.getTextToSpeechTooLongText(normalized.length, MAX_TTS_TEXT_CHARS));
    }

    return normalized;
}
|
||||
|
||||
/**
 * Reports whether `provider` has what it needs to run text-to-speech.
 *
 * OpenAI, Gemini and Mistral need both an API key and a model on their `"textToSpeech"`
 * runtime target. Ollama is always reported as not configured.
 *
 * @param provider Provider to check.
 * @returns `true` when the provider's text-to-speech target is usable.
 */
export function isTextToSpeechConfigured(provider: AiProvider): boolean {
    // Each case has its own block so the `const` bindings do not share one switch-wide scope.
    switch (provider) {
        case AiProvider.OPENAI: {
            const target = resolveAiRuntimeTarget(provider, "textToSpeech");
            return !!target.apiKey && !!target.model;
        }
        case AiProvider.GEMINI: {
            const target = resolveAiRuntimeTarget(provider, "textToSpeech");
            return !!target.apiKey && !!target.model;
        }
        case AiProvider.MISTRAL: {
            const target = resolveAiRuntimeTarget(provider, "textToSpeech");
            return !!target.apiKey && !!target.model;
        }
        case AiProvider.OLLAMA:
            return false;
    }
}
|
||||
|
||||
/**
 * Picks the text-to-speech provider to use for a user.
 *
 * An explicitly requested provider is never replaced: it must be allowed for the user and
 * configured, otherwise an error is thrown. Without an explicit provider the user's effective
 * provider is used when configured, else the first allowed provider that is configured.
 *
 * @param userId User the choice is made for.
 * @param explicitProvider Provider explicitly requested, if any.
 * @returns The chosen provider; `fallback` is `true` only when a substitute for the effective
 *          provider was picked.
 * @throws Error when the explicit provider is not allowed or not configured, or when no allowed
 *         provider is configured.
 */
export async function resolveTextToSpeechProviderForUser(
    userId: number,
    explicitProvider?: AiProvider,
): Promise<TextToSpeechProviderResolution> {
    // Normalise the user's choices and drop unrecognised entries and the "DEFAULT" placeholder.
    const allowedProviders = getAvailableAiProviderChoices(userId)
        .map(choice => normalizeAiProviderChoice(choice))
        .filter((choice): choice is AiProvider => !!choice && choice !== "DEFAULT");

    if (explicitProvider) {
        if (!allowedProviders.includes(explicitProvider)) {
            throw new Error(Environment.getProviderNotAvailableForAccessText(providerName(explicitProvider)));
        }
        if (!isTextToSpeechConfigured(explicitProvider)) {
            throw new Error(Environment.getProviderTextToSpeechUnsupportedText(providerName(explicitProvider)));
        }

        return {provider: explicitProvider, fallback: false};
    }

    const effectiveProvider = await resolveEffectiveAiProviderForUser(userId);
    if (isTextToSpeechConfigured(effectiveProvider)) {
        return {provider: effectiveProvider, fallback: false};
    }

    const fallbackProvider = allowedProviders.find(isTextToSpeechConfigured);
    if (!fallbackProvider) {
        throw new Error(Environment.noTextToSpeechProviderForAccessText);
    }

    return {provider: fallbackProvider, fallback: true};
}
|
||||
|
||||
/**
 * Synthesizes speech for the given text with the requested provider.
 *
 * @param request Provider, text and optional voice.
 * @returns Metadata of the cached audio file.
 * @throws Error when the text is empty or too long (see `assertText`), or when the provider is
 *         Ollama, which has no text-to-speech implementation here.
 */
export async function synthesizeSpeech(request: TextToSpeechRequest): Promise<SynthesizedSpeech> {
    const text = assertText(request.text);
    const {provider, voice} = request;

    switch (provider) {
        case AiProvider.OPENAI:
            return synthesizeOpenAiSpeech(text, voice);
        case AiProvider.GEMINI:
            return synthesizeGeminiSpeech(text, voice);
        case AiProvider.MISTRAL:
            return synthesizeMistralSpeech(text, voice);
        case AiProvider.OLLAMA:
            throw new Error(Environment.ollamaTextToSpeechUnsupportedText);
    }
}
|
||||
|
||||
/**
 * Synthesizes MP3 speech through the OpenAI speech endpoint and caches the file.
 *
 * @param text Validated text to speak.
 * @param voice Voice override; `Environment.OPENAI_TTS_VOICE` is used when empty or omitted.
 * @returns Metadata of the cached MP3 file.
 */
async function synthesizeOpenAiSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.OPENAI, "textToSpeech");
    const openAi = createOpenAiClient(target);
    const effectiveVoice = voice || Environment.OPENAI_TTS_VOICE;

    const response = await openAi.audio.speech.create({
        model: target.model,
        voice: effectiveVoice,
        input: text,
        response_format: "mp3",
        instructions: Environment.OPENAI_TTS_INSTRUCTIONS,
    });

    const buffer = Buffer.from(await response.arrayBuffer());

    return writeSpeechFile({
        provider: AiProvider.OPENAI,
        model: target.model,
        voice: effectiveVoice,
        buffer,
        format: "mp3",
        mimeType: "audio/mpeg",
    });
}
|
||||
|
||||
/**
 * Synthesizes MP3 speech through the Mistral speech API and caches the file.
 *
 * `model` and `voiceId` are only set on the request when a value is available.
 *
 * @param text Validated text to speak.
 * @param voice Voice id override; `Environment.MISTRAL_TTS_VOICE_ID` is used when empty or omitted.
 * @returns Metadata of the cached MP3 file.
 * @throws Error (`mistralTtsNoAudioDataText`) when the response carries no base64 audio data.
 */
async function synthesizeMistralSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.MISTRAL, "textToSpeech");
    const mistralAi = createMistralClient(target);
    const voiceId = voice || Environment.MISTRAL_TTS_VOICE_ID;

    const request: SpeechRequest = {
        input: text,
        responseFormat: "mp3",
    };
    if (target.model) request.model = target.model;
    if (voiceId) request.voiceId = voiceId;

    // Typed loosely because both camelCase and snake_case payload fields are accepted below.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const response: any = await mistralAi.audio.speech.complete(request);
    const audioData = response?.audioData ?? response?.audio_data;
    if (typeof audioData !== "string" || !audioData.trim()) {
        throw new Error(Environment.mistralTtsNoAudioDataText);
    }

    const buffer = Buffer.from(audioData, "base64");

    return writeSpeechFile({
        provider: AiProvider.MISTRAL,
        model: target.model || "mistral speech",
        voice: voiceId,
        buffer,
        format: "mp3",
        mimeType: "audio/mpeg",
    });
}
|
||||
|
||||
/**
 * Synthesizes speech with Gemini (`responseModalities: ["AUDIO"]`) and caches the file.
 *
 * The audio is located with `findGeminiAudioPart` and converted with `decodeGeminiAudio`,
 * so the resulting format depends on what the model returned.
 *
 * @param text Validated text to speak.
 * @param voice Prebuilt voice name override; `Environment.GEMINI_TTS_VOICE` is used when empty or omitted.
 * @returns Metadata of the cached audio file.
 * @throws Error (`geminiTextToSpeechUnsupportedText`) when the response has no audio part.
 */
async function synthesizeGeminiSpeech(text: string, voice?: string): Promise<SynthesizedSpeech> {
    const target = resolveAiRuntimeTarget(AiProvider.GEMINI, "textToSpeech");
    const geminiAi = createGoogleGenAiClient(target);
    const voiceName = voice || Environment.GEMINI_TTS_VOICE;

    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- searched structurally below
    const response: any = await geminiAi.models.generateContent({
        model: target.model,
        contents: text,
        config: {
            responseModalities: ["AUDIO"],
            speechConfig: {
                voiceConfig: {
                    prebuiltVoiceConfig: {
                        voiceName,
                    },
                },
            },
        },
    });

    const audioPart = findGeminiAudioPart(response);
    if (!audioPart) {
        throw new Error(Environment.geminiTextToSpeechUnsupportedText);
    }

    const decoded = decodeGeminiAudio(audioPart.data, audioPart.mimeType);

    return writeSpeechFile({
        provider: AiProvider.GEMINI,
        model: target.model,
        voice: voiceName,
        buffer: decoded.buffer,
        format: decoded.format,
        mimeType: decoded.mimeType,
    });
}
|
||||
|
||||
/**
 * Depth-first search of an arbitrary response object for the first inline audio payload.
 *
 * A node matches when it has an `inlineData` (or `inline_data`) object whose `data` is a
 * string and whose `mimeType` (or `mime_type`) is either absent or starts with `audio/`.
 *
 * @param value Any value; non-objects yield `null`.
 * @returns The base64 `data` and, when it is a string, the MIME type; `null` when nothing matches.
 */
function findGeminiAudioPart(value: unknown): { data: string; mimeType?: string } | null {
    if (!value || typeof value !== "object") return null;
    const record = value as Record<string, unknown>;

    const inlineData = record.inlineData ?? record.inline_data;
    if (inlineData && typeof inlineData === "object") {
        const inlineRecord = inlineData as Record<string, unknown>;
        const data = inlineRecord.data;
        const mimeType = inlineRecord.mimeType ?? inlineRecord.mime_type;

        if (typeof data === "string" && (!mimeType || String(mimeType).startsWith("audio/"))) {
            return {data, mimeType: typeof mimeType === "string" ? mimeType : undefined};
        }
    }

    // Arrays are objects too, so one recursive call covers both nested objects and array
    // items, in the same order; primitives are rejected by the guard at the top.
    for (const child of Object.values(record)) {
        const found = findGeminiAudioPart(child);
        if (found) return found;
    }

    return null;
}
|
||||
|
||||
/**
 * Decodes base64 audio returned by Gemini and maps it to a storable format.
 *
 * Recognised MIME types (mp3/mpeg, wav, flac, opus, aac) are passed through unchanged.
 * Data starting with a `RIFF` header is treated as WAV regardless of the MIME type. Anything
 * else is wrapped into a WAV container by `wrapPcm16InWav`, using the `rate=<n>` MIME
 * parameter when present and 24000 Hz otherwise.
 *
 * When `mimeType` is omitted nothing is assumed about the data: it is sniffed for a `RIFF`
 * header and otherwise wrapped, instead of being labelled WAV while lacking a header.
 * NOTE(review): the wrap path assumes 16-bit mono PCM — confirm against the Gemini
 * speech-generation documentation.
 *
 * @param data Base64-encoded audio.
 * @param mimeType MIME type reported by the API, if any.
 * @returns Audio bytes with the format and MIME type to store them under.
 */
function decodeGeminiAudio(data: string, mimeType?: string): {
    buffer: Buffer;
    format: TextToSpeechFormat;
    mimeType: string;
} {
    const declaredMime = mimeType ?? "";
    const normalizedMime = declaredMime.toLowerCase();
    const raw = Buffer.from(data, "base64");
    const hasRiffHeader = raw.subarray(0, 4).toString("ascii") === "RIFF";

    if (normalizedMime.includes("mpeg") || normalizedMime.includes("mp3")) {
        return {buffer: raw, format: "mp3", mimeType: "audio/mpeg"};
    }
    if (normalizedMime.includes("wav") || hasRiffHeader) {
        return {buffer: raw, format: "wav", mimeType: "audio/wav"};
    }
    if (normalizedMime.includes("flac")) {
        return {buffer: raw, format: "flac", mimeType: "audio/flac"};
    }
    if (normalizedMime.includes("opus")) {
        return {buffer: raw, format: "opus", mimeType: "audio/opus"};
    }
    if (normalizedMime.includes("aac")) {
        return {buffer: raw, format: "aac", mimeType: "audio/aac"};
    }

    // Unrecognised or missing MIME type: wrap the bytes so the file is playable.
    const sampleRate = Number(/rate=(\d+)/i.exec(declaredMime)?.[1]) || 24_000;
    return {
        buffer: wrapPcm16InWav(raw, sampleRate, 1),
        format: "wav",
        mimeType: "audio/wav",
    };
}

/**
 * Prepends a 44-byte canonical WAV (RIFF, PCM format tag 1) header to 16-bit PCM samples.
 *
 * @param pcm Raw sample bytes.
 * @param sampleRate Samples per second per channel.
 * @param channels Number of interleaved channels.
 * @returns A new buffer holding header + samples.
 */
function wrapPcm16InWav(pcm: Buffer, sampleRate: number, channels: number): Buffer {
    const bitsPerSample = 16;
    const blockAlign = channels * bitsPerSample / 8;
    const byteRate = sampleRate * blockAlign;
    const header = Buffer.alloc(44);

    header.write("RIFF", 0, "ascii");
    header.writeUInt32LE(36 + pcm.length, 4);   // RIFF chunk size = file size - 8
    header.write("WAVE", 8, "ascii");
    header.write("fmt ", 12, "ascii");
    header.writeUInt32LE(16, 16);               // fmt chunk size for PCM
    header.writeUInt16LE(1, 20);                // format tag 1 = PCM
    header.writeUInt16LE(channels, 22);
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write("data", 36, "ascii");
    header.writeUInt32LE(pcm.length, 40);

    return Buffer.concat([header, pcm]);
}
|
||||
|
||||
/**
 * Writes synthesized audio into the TTS cache directory (created if missing) and returns
 * its metadata.
 *
 * The file name is `<provider in lower case>-tts-<timestamp ms>-<uuid>.<format>`.
 *
 * @param params Audio bytes plus provider/model/voice/format metadata.
 * @returns Metadata including the generated file name, full path and size in bytes.
 */
function writeSpeechFile(params: SpeechFileParams): SynthesizedSpeech {
    const directory = ttsCacheDir();
    fs.mkdirSync(directory, {recursive: true});

    const fileName = `${params.provider.toLowerCase()}-tts-${Date.now()}-${randomUUID()}.${params.format}`;
    const filePath = path.join(directory, fileName);
    fs.writeFileSync(filePath, params.buffer);

    return {
        provider: params.provider,
        model: params.model,
        voice: params.voice,
        format: params.format,
        mimeType: params.mimeType,
        fileName,
        path: filePath,
        sizeBytes: params.buffer.length,
    };
}
|
||||
|
||||
/**
 * Opens the cached speech file as a read stream wrapped in `FileOptions`, carrying its file
 * name and content type. The caller is responsible for destroying the stream afterwards.
 */
function createSpeechUpload(speech: SynthesizedSpeech): FileOptions {
    const stream = fs.createReadStream(speech.path);
    return new FileOptions(stream, {
        filename: speech.fileName,
        contentType: speech.mimeType,
    });
}
|
||||
|
||||
/** Destroys the upload's underlying file when it exposes a `destroy()` method (e.g. a stream). */
function destroyUpload(upload: FileOptions): void {
    if ("destroy" in upload.file && typeof upload.file.destroy === "function") {
        upload.file.destroy();
    }
}
|
||||
|
||||
/**
 * Sends a synthesized speech file as a reply to `sourceMessage` and records the sent message.
 *
 * MP3 and Opus files are sent as voice messages; if that fails the error is logged and the
 * file is sent as a document instead. All other formats are sent as documents directly.
 * A failing chat-action call is logged and otherwise ignored.
 *
 * @param sourceMessage Message being replied to.
 * @param speech Cached speech file to send.
 * @returns The message Telegram returned for the upload.
 * @throws Error (`speechFileTooLargeText`) when the file exceeds `TELEGRAM_FILE_LIMIT_BYTES`;
 *         errors from the document upload or from storing the message propagate.
 */
export async function sendSynthesizedSpeech(sourceMessage: Message, speech: SynthesizedSpeech): Promise<Message> {
    if (speech.sizeBytes > TELEGRAM_FILE_LIMIT_BYTES) {
        throw new Error(Environment.speechFileTooLargeText);
    }

    const chat = sourceMessage.chat;
    const sendAsVoice = speech.format === "mp3" || speech.format === "opus";
    const caption = Environment.getTextToSpeechCaption(providerName(speech.provider), speech.model, speech.voice);

    await enqueueTelegramApiCall(
        () => bot.sendChatAction({
            chat_id: chat.id,
            action: sendAsVoice ? "upload_voice" : "upload_document",
        }),
        {method: "sendChatAction", chatId: chat.id, chatType: chat.type}
    ).catch(logError);

    let sent: Message;
    if (sendAsVoice) {
        try {
            sent = await enqueueTelegramApiCall(
                async () => {
                    // The stream is created inside the queued call so it is only opened when the call runs.
                    const upload = createSpeechUpload(speech);
                    try {
                        return await bot.sendVoice({
                            chat_id: chat.id,
                            voice: upload,
                            caption,
                            reply_parameters: {message_id: sourceMessage.message_id},
                        });
                    } finally {
                        destroyUpload(upload);
                    }
                },
                {method: "sendVoice", chatId: chat.id, chatType: chat.type}
            );
        } catch (e) {
            logError(e);
            sent = await sendSpeechDocument(sourceMessage, speech, caption);
        }
    } else {
        sent = await sendSpeechDocument(sourceMessage, speech, caption);
    }

    await storeSpeechMessage(sent, sourceMessage, speech);
    return sent;
}
|
||||
|
||||
/**
 * Sends the speech file as a document reply through the Telegram API queue.
 *
 * @param sourceMessage Message being replied to.
 * @param speech Cached speech file to upload.
 * @param caption Caption attached to the document.
 * @returns The message Telegram returned for the upload.
 */
async function sendSpeechDocument(sourceMessage: Message, speech: SynthesizedSpeech, caption: string): Promise<Message> {
    const chat = sourceMessage.chat;

    return enqueueTelegramApiCall(
        async () => {
            const upload = createSpeechUpload(speech);
            try {
                return await bot.sendDocument({
                    chat_id: chat.id,
                    document: upload,
                    caption,
                    reply_parameters: {message_id: sourceMessage.message_id},
                });
            } finally {
                // Always release the read stream, even when the upload fails.
                destroyUpload(upload);
            }
        },
        {method: "sendDocument", chatId: chat.id, chatType: chat.type}
    );
}
|
||||
|
||||
/**
 * Records the sent speech message, with its audio attachment, in `MessageStore`.
 *
 * File ids come from whichever of `voice`, `audio` or `document` the sent message carries;
 * when none is present the local cache path is stored as the file id.
 *
 * @param sent Message returned by Telegram.
 * @param sourceMessage Message that was replied to; used when `sent` has no reply reference.
 * @param speech Cached speech file that was sent.
 */
async function storeSpeechMessage(sent: Message, sourceMessage: Message, speech: SynthesizedSpeech): Promise<void> {
    const file = sent.voice ?? sent.audio ?? sent.document;

    const attachment: StoredAttachment = {
        kind: "audio",
        fileId: file?.file_id ?? speech.path,
        fileUniqueId: file?.file_unique_id,
        fileName: speech.fileName,
        mimeType: speech.mimeType,
        cachePath: speech.path,
    };

    const stored: StoredMessage = {
        chatId: sent.chat.id,
        id: sent.message_id,
        replyToMessageId: sent.reply_to_message?.message_id ?? sourceMessage.message_id,
        fromId: sent.from?.id ?? 0,
        text: sent.caption ?? speech.fileName,
        date: sent.date ?? Math.floor(Date.now() / 1000),
        attachments: [attachment],
    };

    await MessageStore.put(stored);
}
|
||||
@@ -0,0 +1,99 @@
|
||||
import {Message} from "typescript-telegram-bot-api";
|
||||
import {Command} from "../base/command";
|
||||
import {AiProvider} from "../model/ai-provider";
|
||||
import {
|
||||
isTranscribableAudioDownload,
|
||||
resolveSpeechToTextProviderForUser,
|
||||
transcribeSpeechDownloads,
|
||||
} from "../ai/speech-to-text";
|
||||
import {attachmentsToDownloadedFiles, cacheMessageAttachments} from "../ai/telegram-attachments";
|
||||
import {MessageStore} from "../common/message-store";
|
||||
import {StoredAttachment} from "../model/stored-attachment";
|
||||
import {logError, replyToMessage} from "../util/utils";
|
||||
import {Environment} from "../common/environment";
|
||||
|
||||
/** Maximum reply length; longer transcripts are cut and end with "...". */
const TELEGRAM_LIMIT = 4096;

/** Lower-case command tokens accepted as provider names, mapped to the provider they select. */
const PROVIDER_ALIASES = new Map<string, AiProvider>([
    ["openai", AiProvider.OPENAI],
    ["chatgpt", AiProvider.OPENAI],
    ["gpt", AiProvider.OPENAI],
    ["gemini", AiProvider.GEMINI],
    ["google", AiProvider.GEMINI],
    ["mistral", AiProvider.MISTRAL],
    ["ollama", AiProvider.OLLAMA],
]);
|
||||
|
||||
/**
 * Maps a command token to a provider.
 *
 * @param token First word of the command arguments; matched case-insensitively, and one
 *              trailing `:` is ignored (so `gemini:` matches `gemini`).
 * @returns The aliased provider, or `undefined` for an empty or unknown token.
 */
function parseProviderToken(token: string | undefined): AiProvider | undefined {
    if (!token) return undefined;

    const alias = token.toLowerCase().replace(/:$/, "");
    return PROVIDER_ALIASES.get(alias);
}
|
||||
|
||||
/**
 * Returns the attachments of a message, preferring those already recorded in `MessageStore`
 * and otherwise calling `cacheMessageAttachments` on the message.
 *
 * @param msg Message to inspect; `undefined` yields an empty list.
 */
async function collectStoredAttachments(msg: Message | undefined): Promise<StoredAttachment[]> {
    if (!msg) return [];

    const stored = await MessageStore.get(msg.chat.id, msg.message_id);
    if (stored?.attachments?.length) return stored.attachments;

    return cacheMessageAttachments(msg);
}
|
||||
|
||||
/**
 * Gathers the transcribable audio downloads of a message and of the message it replies to.
 *
 * Attachments of the message itself come first. Duplicates, identified by the pair of
 * file id and path, are dropped while keeping the first occurrence.
 *
 * @param msg Command message.
 * @returns De-duplicated downloads accepted by `isTranscribableAudioDownload`.
 */
async function collectAudioDownloads(msg: Message) {
    const attachments = [
        ...await collectStoredAttachments(msg),
        ...await collectStoredAttachments(msg.reply_to_message),
    ];
    const seen = new Set<string>();

    return attachmentsToDownloadedFiles(attachments)
        .filter(isTranscribableAudioDownload)
        .filter(download => {
            const key = `${download.fileId}:${download.path}`;
            if (seen.has(key)) return false;
            seen.add(key);
            return true;
        });
}
|
||||
|
||||
/**
 * `/stt` and `/transcribe` command: transcribes audio attached to the command message or to
 * the message it replies to, and answers with the transcript.
 */
export class SpeechToText extends Command {
    command = ["stt", "transcribe"];
    argsMode = "optional" as const;

    title = Environment.commandTitles.speechToText;
    description = Environment.commandDescriptions.speechToText;

    /**
     * Runs the command.
     *
     * The first argument word may name a provider (see `PROVIDER_ALIASES`); naming one disables
     * fallback to other providers. Without any audio the usage text is sent. Errors are logged
     * and their message is sent back as the reply.
     *
     * @param msg Command message; ignored when it has no sender.
     * @param match Command regex match; group 3 is read as the argument string
     *              (presumably set by the `Command` base class — verify there).
     */
    async execute(msg: Message, match?: RegExpExecArray | null): Promise<void> {
        if (!msg.from) return;

        const args = match?.[3]?.trim() ?? "";
        const explicitProvider = parseProviderToken(args.split(/\s+/)[0]);
        const downloads = await collectAudioDownloads(msg);

        if (!downloads.length) {
            await replyToMessage({
                message: msg,
                text: Environment.speechToTextInstructionText,
            }).catch(logError);
            return;
        }

        try {
            const resolved = await resolveSpeechToTextProviderForUser(msg.from.id, explicitProvider, {
                allowFallback: !explicitProvider,
            });
            const transcript = await transcribeSpeechDownloads(resolved.provider, downloads);
            const text = transcript.trim() || Environment.speechToTextEmptyResultText;

            await replyToMessage({
                message: msg,
                // Keep the reply within the length limit, marking the cut with an ellipsis.
                text: text.length > TELEGRAM_LIMIT ? text.slice(0, TELEGRAM_LIMIT - 3) + "..." : text,
            }).catch(logError);
        } catch (e) {
            logError(e);
            await replyToMessage({
                message: msg,
                text: e instanceof Error ? e.message : String(e),
            }).catch(logError);
        }
    }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
import {Message} from "typescript-telegram-bot-api";
|
||||
import {Command} from "../base/command";
|
||||
import {AiProvider} from "../model/ai-provider";
|
||||
import {
|
||||
resolveTextToSpeechProviderForUser,
|
||||
sendSynthesizedSpeech,
|
||||
synthesizeSpeech,
|
||||
} from "../ai/text-to-speech";
|
||||
import {logError, replyToMessage} from "../util/utils";
|
||||
import {Environment} from "../common/environment";
|
||||
|
||||
/** Lower-case command tokens accepted as provider names, mapped to the provider they select. */
const PROVIDER_ALIASES = new Map<string, AiProvider>([
    ["openai", AiProvider.OPENAI],
    ["chatgpt", AiProvider.OPENAI],
    ["gpt", AiProvider.OPENAI],
    ["gemini", AiProvider.GEMINI],
    ["google", AiProvider.GEMINI],
    ["mistral", AiProvider.MISTRAL],
    ["ollama", AiProvider.OLLAMA],
]);
|
||||
|
||||
/**
 * Maps a command token to a provider.
 *
 * @param token First word of the command arguments; matched case-insensitively, and one
 *              trailing `:` is ignored (so `openai:` matches `openai`).
 * @returns The aliased provider, or `undefined` for an empty or unknown token.
 */
function parseProviderToken(token: string | undefined): AiProvider | undefined {
    if (!token) return undefined;

    const alias = token.toLowerCase().replace(/:$/, "");
    return PROVIDER_ALIASES.get(alias);
}
|
||||
|
||||
/**
 * `/tts`, `/say` and `/voice` command: synthesizes speech from the command arguments or from
 * the replied-to message and sends the audio back.
 */
export class TextToSpeech extends Command {
    command = ["tts", "say", "voice"];
    argsMode = "optional" as const;

    title = Environment.commandTitles.textToSpeech;
    description = Environment.commandDescriptions.textToSpeech;

    /**
     * Runs the command.
     *
     * When the first argument word names a provider (see `PROVIDER_ALIASES`) it selects that
     * provider and the remaining words are the text; otherwise the whole argument string is the
     * text. In both cases an empty text falls back to the text or caption of the replied-to
     * message. With no text at all the usage text is sent. Errors are logged and their message
     * is sent back as the reply.
     *
     * @param msg Command message; ignored when it has no sender.
     * @param match Command regex match; group 3 is read as the argument string
     *              (presumably set by the `Command` base class — verify there).
     */
    async execute(msg: Message, match?: RegExpExecArray | null): Promise<void> {
        if (!msg.from) return;

        const args = match?.[3]?.trim() ?? "";
        const replyText = (msg.reply_to_message?.text ?? msg.reply_to_message?.caption ?? "").trim();
        const [firstToken = "", ...restTokens] = args.split(/\s+/);
        const explicitProvider = parseProviderToken(firstToken);
        const text = explicitProvider
            ? (restTokens.join(" ").trim() || replyText)
            : (args || replyText);

        if (!text.trim()) {
            await replyToMessage({
                message: msg,
                text: Environment.textToSpeechInstructionText,
            }).catch(logError);
            return;
        }

        try {
            const resolved = await resolveTextToSpeechProviderForUser(msg.from.id, explicitProvider);
            const speech = await synthesizeSpeech({provider: resolved.provider, text});
            await sendSynthesizedSpeech(msg, speech);
        } catch (e) {
            logError(e);
            await replyToMessage({
                message: msg,
                text: e instanceof Error ? e.message : String(e),
            }).catch(logError);
        }
    }
}
|
||||
Reference in New Issue
Block a user