From 1f793e8750c84ac6f7c509415bc5adaa4b37ff92 Mon Sep 17 00:00:00 2001
From: Nawaz Dhandala
Date: Mon, 29 Dec 2025 15:54:18 +0000
Subject: [PATCH] feat: Add job to reset stuck AI Agent tasks to Scheduled status

---
 Worker/Jobs/AIAgent/TimeoutStuckTasks.ts | 98 ++++++++++++++++++++++++
 Worker/Routes.ts                         |  1 +
 2 files changed, 99 insertions(+)
 create mode 100644 Worker/Jobs/AIAgent/TimeoutStuckTasks.ts

diff --git a/Worker/Jobs/AIAgent/TimeoutStuckTasks.ts b/Worker/Jobs/AIAgent/TimeoutStuckTasks.ts
new file mode 100644
index 0000000000..7d40a3eec6
--- /dev/null
+++ b/Worker/Jobs/AIAgent/TimeoutStuckTasks.ts
@@ -0,0 +1,98 @@
+import RunCron from "../../Utils/Cron";
+import OneUptimeDate from "Common/Types/Date";
+import AIAgentTaskStatus from "Common/Types/AI/AIAgentTaskStatus";
+import { EVERY_MINUTE } from "Common/Utils/CronTime";
+import AIAgentTaskService from "Common/Server/Services/AIAgentTaskService";
+import QueryHelper from "Common/Server/Types/Database/QueryHelper";
+import AIAgentTask from "Common/Models/DatabaseModels/AIAgentTask";
+import logger from "Common/Server/Utils/Logger";
+
+/**
+ * AI Agent tasks that are stuck in "InProgress" status for longer than
+ * TASK_TIMEOUT_MINUTES are considered timed out. This can happen when the AI
+ * Agent container crashes or is deleted while processing a task.
+ *
+ * This job runs every minute and resets stuck tasks back to "Scheduled" status
+ * so they can be retried.
+ */
+
+// How long a task may stay InProgress (measured from startedAt) before it is reset.
+const TASK_TIMEOUT_MINUTES: number = 30;
+
+RunCron(
+  "AIAgent:TimeoutStuckTasks",
+  {
+    schedule: EVERY_MINUTE,
+    runOnStartup: false,
+  },
+  async () => {
+    logger.debug("Checking for stuck AI Agent tasks");
+
+    const timeoutThreshold: Date =
+      OneUptimeDate.getSomeMinutesAgo(TASK_TIMEOUT_MINUTES);
+
+    // Find tasks that have been InProgress for longer than the timeout threshold
+    const stuckTasks: Array<AIAgentTask> = await AIAgentTaskService.findAllBy({
+      query: {
+        status: AIAgentTaskStatus.InProgress,
+        startedAt: QueryHelper.lessThan(timeoutThreshold),
+      },
+      select: {
+        _id: true,
+        startedAt: true,
+      },
+      props: {
+        isRoot: true,
+      },
+    });
+
+    if (stuckTasks.length === 0) {
+      return;
+    }
+
+    logger.info(
+      `Found ${stuckTasks.length} stuck AI Agent task(s). Resetting to Scheduled status.`,
+    );
+
+    /*
+     * Reset tasks one at a time; a failure is logged and the loop continues so
+     * that one bad task does not block the rest.
+     *
+     * NOTE(review): the update is keyed by id only, so a task that leaves
+     * InProgress between the query above and this update would still be reset.
+     * Confirm whether that window matters for the AI Agent.
+     */
+    for (const task of stuckTasks) {
+      if (!task.id) {
+        // _id is selected above, so this is not expected; skip rather than assert.
+        logger.error("Stuck AI Agent task has no id. Skipping reset.");
+        continue;
+      }
+
+      try {
+        await AIAgentTaskService.updateOneById({
+          id: task.id,
+          data: {
+            status: AIAgentTaskStatus.Scheduled,
+            statusMessage:
+              "Task was reset due to timeout. The AI Agent processing this task may have crashed or been terminated.",
+            // Clear the previous start time along with the status.
+            startedAt: null,
+          },
+          props: {
+            isRoot: true,
+          },
+        });
+
+        logger.info(
+          `Reset stuck AI Agent task ${task.id?.toString()} to Scheduled status`,
+        );
+      } catch (error) {
+        logger.error(
+          `Failed to reset stuck AI Agent task ${task.id?.toString()}:`,
+        );
+        logger.error(error);
+      }
+    }
+  },
+);
diff --git a/Worker/Routes.ts b/Worker/Routes.ts
index 60924e13bd..7ac5e159c2 100644
--- a/Worker/Routes.ts
+++ b/Worker/Routes.ts
@@ -90,6 +90,7 @@ import "./Jobs/Probe/UpdateConnectionStatus";
 // AI Agents
 import "./Jobs/AIAgent/SendOwnerAddedNotification";
 import "./Jobs/AIAgent/UpdateConnectionStatus";
+import "./Jobs/AIAgent/TimeoutStuckTasks";
 
 // Telemetry Monitors.
 import "./Jobs/TelemetryMonitor/MonitorTelemetryMonitor";