From 78932e82af917d22c88283c5648da9ced9601f3c Mon Sep 17 00:00:00 2001 From: Danil Nikolaev Date: Wed, 13 May 2026 16:07:47 +0300 Subject: [PATCH] Route Gemini vision/OCR requests to the image model target --- src/ai/ai-runtime-target.ts | 2 ++ src/ai/provider-model-runtime.ts | 6 +++--- src/ai/unified-ai-runner.gemini.ts | 24 ++++++++++++++++++++++-- src/ai/unified-ai-runner.shared.ts | 3 +++ 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/ai/ai-runtime-target.ts b/src/ai/ai-runtime-target.ts index 6d1fa6d..4819675 100644 --- a/src/ai/ai-runtime-target.ts +++ b/src/ai/ai-runtime-target.ts @@ -119,6 +119,8 @@ export function getDefaultModelForPurpose(provider: AiProvider, purpose: AiRunti } case AiProvider.GEMINI: switch (purpose) { + case "vision": + case "ocr": case "outputImages": return Environment.GEMINI_IMAGE_MODEL; case "speechToText": diff --git a/src/ai/provider-model-runtime.ts b/src/ai/provider-model-runtime.ts index 14cd623..6f0b560 100644 --- a/src/ai/provider-model-runtime.ts +++ b/src/ai/provider-model-runtime.ts @@ -164,14 +164,14 @@ export async function getModelCapabilities( case AiProvider.GEMINI: { const chatLike = lowerModelName(model).startsWith("gemini-") && !isGeminiNonChatModel(model); const reasoningModel = lowerModelName(model).includes("2.5") || lowerModelName(model).includes("thinking"); - const imageTarget = resolveAiRuntimeTarget(provider, "outputImages"); + const imageTarget = resolveAiRuntimeTarget(provider, "vision"); const speechTarget = resolveAiRuntimeTarget(provider, "speechToText"); const ttsTarget = resolveAiRuntimeTarget(provider, "textToSpeech"); return buildCapabilities({ chat: capability(true, target, runtimeTarget), - vision: capability(chatLike, target, runtimeTarget), - ocr: capability(chatLike, target, runtimeTarget), + vision: capability(!!imageTarget.apiKey && !!imageTarget.model, imageTarget, runtimeTarget), + ocr: capability(!!imageTarget.apiKey && !!imageTarget.model, imageTarget, runtimeTarget), thinking: capability(reasoningModel, target, 
runtimeTarget), extendedThinking: capability(reasoningModel, target, runtimeTarget), tools: capability(chatLike, target, runtimeTarget), diff --git a/src/ai/unified-ai-runner.gemini.ts b/src/ai/unified-ai-runner.gemini.ts index 2bb9fdb..723a690 100644 --- a/src/ai/unified-ai-runner.gemini.ts +++ b/src/ai/unified-ai-runner.gemini.ts @@ -6,7 +6,19 @@ import {GeminiMessage} from "./gemini-chat-message"; import {createGoogleGenAiClient} from "./ai-runtime-target"; import {aiLog, aiLogDuration, aiLogProviderTarget, aiLogToolCall} from "../logging/ai-logger"; -import {AsyncIterableStream, GeminiFunctionCallLike, GeminiResponseLike, MAX_TOOL_ROUNDS, RuntimeConfigSnapshot, ToolCallData, ToolExecutionMemory, executeToolBatch, roundStatus, safeJsonParseObject, GeminiGenerationRequest} from "./unified-ai-runner.shared"; +import { + AsyncIterableStream, + executeToolBatch, + GeminiFunctionCallLike, + GeminiGenerationRequest, + GeminiResponseLike, + MAX_TOOL_ROUNDS, + roundStatus, + RuntimeConfigSnapshot, + safeJsonParseObject, + ToolCallData, + ToolExecutionMemory +} from "./unified-ai-runner.shared"; function collectGeminiResponseText(response: GeminiResponseLike & { text?: string }): string { if (typeof response.text === "string") return response.text; @@ -88,6 +100,14 @@ export async function runGemini( hasToolInputFiles: !!toolContext.pythonInputFiles?.length, }); + // TODO: 13.05.2026, Danil Nikolaev: find a better way? + const imageCount = messages.reduce((sum, m) => { + return sum + (m.parts.filter(p => "inlineData" in p && "mimeType" in p.inlineData && p.inlineData.mimeType.startsWith("image")).length) + }, 0); + + const target = imageCount ? 
config.geminiImageTarget : config.geminiChatTarget; + const model = target.model; + const toolMemory: ToolExecutionMemory = new Map(); for (let round = 0; round < MAX_TOOL_ROUNDS; round++) { @@ -99,7 +119,7 @@ export async function runGemini( await streamMessage.flush(); const request: GeminiGenerationRequest = { - model: config.geminiChatTarget.model, + model: model, contents: messages, config: { tools: getGeminiTools(), diff --git a/src/ai/unified-ai-runner.shared.ts b/src/ai/unified-ai-runner.shared.ts index 7af3ebe..8b870e6 100644 --- a/src/ai/unified-ai-runner.shared.ts +++ b/src/ai/unified-ai-runner.shared.ts @@ -279,6 +279,7 @@ export type RuntimeConfigSnapshot = { ollamaRagMaxArchiveDepth: number; geminiChatTarget: AiRuntimeTarget; + geminiImageTarget: AiRuntimeTarget; mistralChatTarget: AiRuntimeTarget; @@ -310,6 +311,7 @@ export function snapshotRuntimeConfig(): RuntimeConfigSnapshot { ollamaRagMaxArchiveDepth: Environment.OLLAMA_RAG_MAX_ARCHIVE_DEPTH, geminiChatTarget: resolveAiRuntimeTarget(AiProvider.GEMINI, "chat"), + geminiImageTarget: resolveAiRuntimeTarget(AiProvider.GEMINI, "vision"), mistralChatTarget: resolveAiRuntimeTarget(AiProvider.MISTRAL, "chat"), @@ -641,6 +643,7 @@ export async function rejectUnsupportedAttachments( if (!unsupported) return false; if (!kinds.has("audio")) { + // TODO: 13.05.2026, Danil Nikolaev: add "Regenerate" button await replyToMessage({ message: msg, text: unsupportedAttachmentText(provider, effectiveModel, unsupported),