Files
oneuptime/Common/Server/Utils/Monitor/MonitorCriteriaEvaluator.ts

1135 lines
38 KiB
TypeScript

import logger from "../Logger";
import VMUtil from "../VM/VMAPI";
import APIRequestCriteria from "./Criteria/APIRequestCriteria";
import CustomCodeMonitoringCriteria from "./Criteria/CustomCodeMonitorCriteria";
import IncomingEmailCriteria from "./Criteria/IncomingEmailCriteria";
import IncomingRequestCriteria from "./Criteria/IncomingRequestCriteria";
import SSLMonitorCriteria from "./Criteria/SSLMonitorCriteria";
import ServerMonitorCriteria from "./Criteria/ServerMonitorCriteria";
import SyntheticMonitoringCriteria from "./Criteria/SyntheticMonitor";
import LogMonitorCriteria from "./Criteria/LogMonitorCriteria";
import MetricMonitorCriteria from "./Criteria/MetricMonitorCriteria";
import TraceMonitorCriteria from "./Criteria/TraceMonitorCriteria";
import ExceptionMonitorCriteria from "./Criteria/ExceptionMonitorCriteria";
import ProfileMonitorCriteria from "./Criteria/ProfileMonitorCriteria";
import SnmpMonitorCriteria from "./Criteria/SnmpMonitorCriteria";
import DnsMonitorCriteria from "./Criteria/DnsMonitorCriteria";
import DomainMonitorCriteria from "./Criteria/DomainMonitorCriteria";
import ExternalStatusPageMonitorCriteria from "./Criteria/ExternalStatusPageMonitorCriteria";
import MonitorCriteriaMessageBuilder from "./MonitorCriteriaMessageBuilder";
import MonitorCriteriaDataExtractor from "./MonitorCriteriaDataExtractor";
import MonitorCriteriaMessageFormatter from "./MonitorCriteriaMessageFormatter";
import DataToProcess from "./DataToProcess";
import Monitor from "../../../Models/DatabaseModels/Monitor";
import MonitorCriteria from "../../../Types/Monitor/MonitorCriteria";
import MonitorCriteriaInstance from "../../../Types/Monitor/MonitorCriteriaInstance";
import MonitorStep from "../../../Types/Monitor/MonitorStep";
import FilterCondition from "../../../Types/Filter/FilterCondition";
import MonitorEvaluationSummary, {
MonitorEvaluationCriteriaResult,
MonitorEvaluationEvent,
MonitorEvaluationFilterResult,
} from "../../../Types/Monitor/MonitorEvaluationSummary";
import ProbeApiIngestResponse from "../../../Types/Probe/ProbeApiIngestResponse";
import ProbeMonitorResponse from "../../../Types/Probe/ProbeMonitorResponse";
import RequestFailedDetails from "../../../Types/Probe/RequestFailedDetails";
import IncomingMonitorRequest from "../../../Types/Monitor/IncomingMonitor/IncomingMonitorRequest";
import MonitorType from "../../../Types/Monitor/MonitorType";
import { CheckOn, CriteriaFilter } from "../../../Types/Monitor/CriteriaFilter";
import OneUptimeDate from "../../../Types/Date";
import { JSONObject } from "../../../Types/JSON";
import Typeof from "../../../Types/Typeof";
import ReturnResult from "../../../Types/IsolatedVM/ReturnResult";
import URL from "../../../Types/API/URL";
import IP from "../../../Types/IP/IP";
import Hostname from "../../../Types/API/Hostname";
import Port from "../../../Types/Port";
import MetricMonitorResponse, {
KubernetesAffectedResource,
KubernetesResourceBreakdown,
} from "../../../Types/Monitor/MetricMonitor/MetricMonitorResponse";
export default class MonitorCriteriaEvaluator {
public static async processMonitorStep(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
probeApiIngestResponse: ProbeApiIngestResponse;
evaluationSummary: MonitorEvaluationSummary;
}): Promise<ProbeApiIngestResponse> {
const criteria: MonitorCriteria | undefined =
input.monitorStep.data?.monitorCriteria;
if (!criteria || !criteria.data) {
return input.probeApiIngestResponse;
}
for (const criteriaInstance of criteria.data.monitorCriteriaInstanceArray) {
const criteriaResult: MonitorEvaluationCriteriaResult = {
criteriaId: criteriaInstance.data?.id,
criteriaName: criteriaInstance.data?.name,
filterCondition:
criteriaInstance.data?.filterCondition || FilterCondition.All,
met: false,
message: "",
filters: [],
};
input.evaluationSummary.criteriaResults.push(criteriaResult);
const rootCause: string | null =
await MonitorCriteriaEvaluator.processMonitorCriteriaInstance({
dataToProcess: input.dataToProcess,
monitorStep: input.monitorStep,
monitor: input.monitor,
probeApiIngestResponse: input.probeApiIngestResponse,
criteriaInstance: criteriaInstance,
criteriaResult: criteriaResult,
});
if (!criteriaResult.message) {
criteriaResult.message = criteriaResult.met
? "Criteria met."
: "Criteria was not met.";
}
const criteriaEvent: MonitorEvaluationEvent = {
type: criteriaResult.met ? "criteria-met" : "criteria-not-met",
title: `${criteriaResult.met ? "Criteria met" : "Criteria not met"}: ${criteriaResult.criteriaName || "Unnamed criteria"}`,
message: criteriaResult.message,
relatedCriteriaId: criteriaResult.criteriaId,
at: OneUptimeDate.getCurrentDate(),
};
input.evaluationSummary.events.push(criteriaEvent);
if (rootCause) {
input.probeApiIngestResponse.criteriaMetId = criteriaInstance.data?.id;
input.probeApiIngestResponse.rootCause = `
**Created because the following criteria was met**:
**Criteria Name**: ${criteriaInstance.data?.name}
`;
input.probeApiIngestResponse.rootCause += `
**Filter Conditions Met**: ${rootCause}
`;
const contextBlock: string | null =
MonitorCriteriaEvaluator.buildRootCauseContext({
dataToProcess: input.dataToProcess,
monitorStep: input.monitorStep,
monitor: input.monitor,
});
if (contextBlock) {
input.probeApiIngestResponse.rootCause += `
${contextBlock}
`;
}
if ((input.dataToProcess as ProbeMonitorResponse).failureCause) {
input.probeApiIngestResponse.rootCause += `
**Cause**: ${(input.dataToProcess as ProbeMonitorResponse).failureCause || ""}
`;
}
break;
}
}
return input.probeApiIngestResponse;
}
private static async processMonitorCriteriaInstance(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
probeApiIngestResponse: ProbeApiIngestResponse;
criteriaInstance: MonitorCriteriaInstance;
criteriaResult: MonitorEvaluationCriteriaResult;
}): Promise<string | null> {
return MonitorCriteriaEvaluator.isMonitorInstanceCriteriaFiltersMet({
dataToProcess: input.dataToProcess,
monitorStep: input.monitorStep,
monitor: input.monitor,
probeApiIngestResponse: input.probeApiIngestResponse,
criteriaInstance: input.criteriaInstance,
criteriaResult: input.criteriaResult,
});
}
private static async isMonitorInstanceCriteriaFiltersMet(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
probeApiIngestResponse: ProbeApiIngestResponse;
criteriaInstance: MonitorCriteriaInstance;
criteriaResult: MonitorEvaluationCriteriaResult;
}): Promise<string | null> {
const filterCondition: FilterCondition =
input.criteriaInstance.data?.filterCondition || FilterCondition.All;
const matchedFilterMessages: Array<string> = [];
let hasMatch: boolean = false;
let allFiltersMet: boolean = true;
for (const criteriaFilter of input.criteriaInstance.data?.filters || []) {
const rootCause: string | null =
await MonitorCriteriaEvaluator.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
monitorStep: input.monitorStep,
monitor: input.monitor,
probeApiIngestResponse: input.probeApiIngestResponse,
criteriaInstance: input.criteriaInstance,
criteriaFilter: criteriaFilter,
});
const didMeetCriteria: boolean = Boolean(rootCause);
const filterMessage: string =
MonitorCriteriaMessageBuilder.buildCriteriaFilterMessage({
monitor: input.monitor,
criteriaFilter: criteriaFilter,
dataToProcess: input.dataToProcess,
monitorStep: input.monitorStep,
didMeetCriteria: didMeetCriteria,
matchMessage: rootCause,
});
const filterSummary: MonitorEvaluationFilterResult = {
checkOn: criteriaFilter.checkOn,
filterType: criteriaFilter.filterType,
value: criteriaFilter.value,
met: didMeetCriteria,
message: filterMessage,
};
input.criteriaResult.filters.push(filterSummary);
if (didMeetCriteria) {
hasMatch = true;
matchedFilterMessages.push(filterMessage);
} else if (filterCondition === FilterCondition.All) {
allFiltersMet = false;
}
}
if (filterCondition === FilterCondition.All) {
if (allFiltersMet && input.criteriaResult.filters.length > 0) {
let message: string = "All filters met.";
if (matchedFilterMessages.length > 0) {
message += matchedFilterMessages
.map((item: string) => {
return `\n- ${item}`;
})
.join("");
}
input.criteriaResult.met = true;
input.criteriaResult.message = message;
return message;
}
input.criteriaResult.met = false;
input.criteriaResult.message =
"One or more filters did not meet the configured conditions.";
return null;
}
if (filterCondition === FilterCondition.Any) {
if (hasMatch) {
const firstMatch: string =
matchedFilterMessages[0] ||
"At least one filter met the configured condition.";
input.criteriaResult.met = true;
input.criteriaResult.message = firstMatch;
return firstMatch;
}
input.criteriaResult.met = false;
input.criteriaResult.message =
"No filters met the configured conditions.";
return null;
}
return null;
}
private static async isMonitorInstanceCriteriaFilterMet(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
probeApiIngestResponse: ProbeApiIngestResponse;
criteriaInstance: MonitorCriteriaInstance;
criteriaFilter: CriteriaFilter;
}): Promise<string | null> {
if (input.criteriaFilter.checkOn === CheckOn.JavaScriptExpression) {
let storageMap: JSONObject = {};
if (
input.monitor.monitorType === MonitorType.API ||
input.monitor.monitorType === MonitorType.Website
) {
let responseBody: JSONObject | null = null;
try {
responseBody = JSON.parse(
((input.dataToProcess as ProbeMonitorResponse)
.responseBody as string) || "{}",
);
} catch (err) {
logger.error(err);
responseBody = (input.dataToProcess as ProbeMonitorResponse)
.responseBody as JSONObject;
}
if (
typeof responseBody === Typeof.String &&
responseBody?.toString() === ""
) {
responseBody = {};
}
storageMap = {
responseBody: responseBody,
responseHeaders: (input.dataToProcess as ProbeMonitorResponse)
.responseHeaders,
responseStatusCode: (input.dataToProcess as ProbeMonitorResponse)
.responseCode,
responseTimeInMs: (input.dataToProcess as ProbeMonitorResponse)
.responseTimeInMs,
isOnline: (input.dataToProcess as ProbeMonitorResponse).isOnline,
};
}
if (input.monitor.monitorType === MonitorType.IncomingRequest) {
storageMap = {
requestBody: (input.dataToProcess as IncomingMonitorRequest)
.requestBody,
requestHeaders: (input.dataToProcess as IncomingMonitorRequest)
.requestHeaders,
};
}
let expression: string = input.criteriaFilter.value as string;
expression = VMUtil.replaceValueInPlace(storageMap, expression, false);
const code: string = `return Boolean(${expression});`;
let result: ReturnResult | null = null;
try {
result = await VMUtil.runCodeInSandbox({
code: code,
options: {
args: {},
},
});
} catch (err) {
logger.error(err);
return null;
}
if (result && result.returnValue) {
return `JavaScript Expression - ${expression} - evaluated to true.`;
}
return null;
}
if (
input.monitor.monitorType === MonitorType.API ||
input.monitor.monitorType === MonitorType.Website ||
input.monitor.monitorType === MonitorType.IP ||
input.monitor.monitorType === MonitorType.Ping ||
input.monitor.monitorType === MonitorType.Port
) {
const apiRequestCriteriaResult: string | null =
await APIRequestCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (apiRequestCriteriaResult) {
return apiRequestCriteriaResult;
}
}
if (
input.monitor.monitorType === MonitorType.CustomJavaScriptCode &&
(input.dataToProcess as ProbeMonitorResponse).customCodeMonitorResponse
) {
const criteriaResult: string | null =
await CustomCodeMonitoringCriteria.isMonitorInstanceCriteriaFilterMet({
monitorResponse: (input.dataToProcess as ProbeMonitorResponse)
.customCodeMonitorResponse!,
criteriaFilter: input.criteriaFilter,
});
if (criteriaResult) {
return criteriaResult;
}
}
if (
input.monitor.monitorType === MonitorType.SyntheticMonitor &&
(input.dataToProcess as ProbeMonitorResponse).syntheticMonitorResponse
) {
const criteriaResult: string | null =
await SyntheticMonitoringCriteria.isMonitorInstanceCriteriaFilterMet({
monitorResponse:
(input.dataToProcess as ProbeMonitorResponse)
.syntheticMonitorResponse || [],
criteriaFilter: input.criteriaFilter,
});
if (criteriaResult) {
return criteriaResult;
}
}
if (input.monitor.monitorType === MonitorType.IncomingRequest) {
const incomingRequestResult: string | null =
await IncomingRequestCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (incomingRequestResult) {
return incomingRequestResult;
}
}
if (input.monitor.monitorType === MonitorType.IncomingEmail) {
const incomingEmailResult: string | null =
await IncomingEmailCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (incomingEmailResult) {
return incomingEmailResult;
}
}
if (input.monitor.monitorType === MonitorType.SSLCertificate) {
const sslMonitorResult: string | null =
await SSLMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (sslMonitorResult) {
return sslMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.Server) {
const serverMonitorResult: string | null =
await ServerMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (serverMonitorResult) {
return serverMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.Logs) {
const logMonitorResult: string | null =
await LogMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (logMonitorResult) {
return logMonitorResult;
}
}
if (
input.monitor.monitorType === MonitorType.Metrics ||
input.monitor.monitorType === MonitorType.Kubernetes
) {
const metricMonitorResult: string | null =
await MetricMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
monitorStep: input.monitorStep,
});
if (metricMonitorResult) {
return metricMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.Traces) {
const traceMonitorResult: string | null =
await TraceMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (traceMonitorResult) {
return traceMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.Exceptions) {
const exceptionMonitorResult: string | null =
await ExceptionMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (exceptionMonitorResult) {
return exceptionMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.Profiles) {
const profileMonitorResult: string | null =
await ProfileMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (profileMonitorResult) {
return profileMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.SNMP) {
const snmpMonitorResult: string | null =
await SnmpMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (snmpMonitorResult) {
return snmpMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.DNS) {
const dnsMonitorResult: string | null =
await DnsMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (dnsMonitorResult) {
return dnsMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.Domain) {
const domainMonitorResult: string | null =
await DomainMonitorCriteria.isMonitorInstanceCriteriaFilterMet({
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
});
if (domainMonitorResult) {
return domainMonitorResult;
}
}
if (input.monitor.monitorType === MonitorType.ExternalStatusPage) {
const externalStatusPageResult: string | null =
await ExternalStatusPageMonitorCriteria.isMonitorInstanceCriteriaFilterMet(
{
dataToProcess: input.dataToProcess,
criteriaFilter: input.criteriaFilter,
},
);
if (externalStatusPageResult) {
return externalStatusPageResult;
}
}
return null;
}
private static buildRootCauseContext(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
}): string | null {
// Handle Kubernetes monitors with rich resource context
if (input.monitor.monitorType === MonitorType.Kubernetes) {
return MonitorCriteriaEvaluator.buildKubernetesRootCauseContext(input);
}
const requestDetails: Array<string> = [];
const responseDetails: Array<string> = [];
const failureDetails: Array<string> = [];
const probeResponse: ProbeMonitorResponse | null =
MonitorCriteriaDataExtractor.getProbeMonitorResponse(input.dataToProcess);
const destination: string | null =
MonitorCriteriaEvaluator.getMonitorDestinationString({
monitorStep: input.monitorStep,
probeResponse: probeResponse,
});
if (destination) {
requestDetails.push(`- Destination: ${destination}`);
}
const port: string | null = MonitorCriteriaEvaluator.getMonitorPortString({
monitorStep: input.monitorStep,
probeResponse: probeResponse,
});
if (port) {
requestDetails.push(`- Destination Port: ${port}`);
}
const requestMethod: string | null =
MonitorCriteriaEvaluator.getRequestMethodString({
monitor: input.monitor,
monitorStep: input.monitorStep,
});
if (requestMethod) {
requestDetails.push(`- Request Method: ${requestMethod}`);
}
if (probeResponse?.responseCode !== undefined) {
responseDetails.push(
`- Response Status Code: ${probeResponse.responseCode}`,
);
}
const responseTime: string | null =
MonitorCriteriaEvaluator.formatMilliseconds(
probeResponse?.responseTimeInMs,
);
if (responseTime) {
responseDetails.push(`- Response Time: ${responseTime}`);
}
if (probeResponse?.isTimeout !== undefined) {
responseDetails.push(
`- Timed Out: ${probeResponse.isTimeout ? "Yes" : "No"}`,
);
}
// Add Request Failed Details if available
if (probeResponse?.requestFailedDetails) {
const requestFailedDetails: RequestFailedDetails =
probeResponse.requestFailedDetails;
if (requestFailedDetails.failedPhase) {
failureDetails.push(
`- Failed Phase: ${requestFailedDetails.failedPhase}`,
);
}
if (requestFailedDetails.errorCode) {
failureDetails.push(`- Error Code: ${requestFailedDetails.errorCode}`);
}
if (requestFailedDetails.errorDescription) {
failureDetails.push(
`- Error Description: ${requestFailedDetails.errorDescription}`,
);
}
if (requestFailedDetails.rawErrorMessage) {
failureDetails.push(
`- Raw Error Message: ${requestFailedDetails.rawErrorMessage}`,
);
}
}
const sections: Array<string> = [];
if (requestDetails.length > 0) {
sections.push(`**Request Details**\n${requestDetails.join("\n")}`);
}
if (responseDetails.length > 0) {
sections.push(`\n\n**Response Snapshot**\n${responseDetails.join("\n")}`);
}
if (failureDetails.length > 0) {
sections.push(
`\n\n**Request Failed Details**\n${failureDetails.join("\n")}`,
);
}
if (!sections.length) {
return null;
}
return sections.join("\n");
}
private static buildKubernetesRootCauseContext(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
}): string | null {
const metricResponse: MetricMonitorResponse =
input.dataToProcess as MetricMonitorResponse;
const breakdown: KubernetesResourceBreakdown | undefined =
metricResponse.kubernetesResourceBreakdown;
if (!breakdown) {
return null;
}
const sections: Array<string> = [];
// Cluster context
const clusterDetails: Array<string> = [];
clusterDetails.push(`- Cluster: ${breakdown.clusterName}`);
clusterDetails.push(
`- Metric: ${breakdown.metricFriendlyName} (\`${breakdown.metricName}\`)`,
);
if (breakdown.attributes["k8s.namespace.name"]) {
clusterDetails.push(
`- Namespace: ${breakdown.attributes["k8s.namespace.name"]}`,
);
}
sections.push(
`**Kubernetes Cluster Details**\n${clusterDetails.join("\n")}`,
);
// Affected resources
if (breakdown.affectedResources && breakdown.affectedResources.length > 0) {
const resourceLines: Array<string> = [];
// Sort by metric value descending (worst first) and filter out zero-value resources
const sortedResources: Array<KubernetesAffectedResource> = [
...breakdown.affectedResources,
]
.filter((r: KubernetesAffectedResource) => {
return r.metricValue > 0;
})
.sort(
(
a: KubernetesAffectedResource,
b: KubernetesAffectedResource,
) => {
return b.metricValue - a.metricValue;
},
);
if (sortedResources.length === 0) {
continue;
}
// Show top 10 affected resources
const resourcesToShow: Array<KubernetesAffectedResource> =
sortedResources.slice(0, 10);
// Determine which columns are present across all resources
const hasNamespace: boolean = resourcesToShow.some(
(r: KubernetesAffectedResource) => r.namespace,
);
const hasWorkload: boolean = resourcesToShow.some(
(r: KubernetesAffectedResource) => r.workloadType && r.workloadName,
);
const hasPod: boolean = resourcesToShow.some(
(r: KubernetesAffectedResource) => r.podName,
);
const hasContainer: boolean = resourcesToShow.some(
(r: KubernetesAffectedResource) => r.containerName,
);
const hasNode: boolean = resourcesToShow.some(
(r: KubernetesAffectedResource) => r.nodeName,
);
// Build table header
const headerCells: Array<string> = [];
if (hasNamespace) {
headerCells.push("Namespace");
}
if (hasWorkload) {
headerCells.push("Workload Type");
headerCells.push("Workload");
}
if (hasPod) {
headerCells.push("Pod");
}
if (hasContainer) {
headerCells.push("Container");
}
if (hasNode) {
headerCells.push("Node");
}
headerCells.push("Value");
const headerRow: string = `| ${headerCells.join(" | ")} |`;
const separatorRow: string = `| ${headerCells.map(() => {
return "---";
}).join(" | ")} |`;
resourceLines.push(headerRow);
resourceLines.push(separatorRow);
for (const resource of resourcesToShow) {
const cells: Array<string> = [];
if (hasNamespace) {
cells.push(resource.namespace ? `\`${resource.namespace}\`` : "-");
}
if (hasWorkload) {
cells.push(
resource.workloadType ? `${resource.workloadType}` : "-",
);
cells.push(
resource.workloadName ? `\`${resource.workloadName}\`` : "-",
);
}
if (hasPod) {
cells.push(resource.podName ? `\`${resource.podName}\`` : "-");
}
if (hasContainer) {
cells.push(
resource.containerName ? `\`${resource.containerName}\`` : "-",
);
}
if (hasNode) {
cells.push(resource.nodeName ? `\`${resource.nodeName}\`` : "-");
}
cells.push(`**${resource.metricValue}**`);
resourceLines.push(`| ${cells.join(" | ")} |`);
}
if (sortedResources.length > 10) {
resourceLines.push(
`\n*... and ${sortedResources.length - 10} more affected resources*`,
);
}
sections.push(
`\n\n**Affected Resources** (${sortedResources.length} total)\n\n${resourceLines.join("\n")}`,
);
// Add root cause analysis based on metric type
const analysis: string | null =
MonitorCriteriaEvaluator.buildKubernetesRootCauseAnalysis({
breakdown: breakdown,
topResource: resourcesToShow[0]!,
});
if (analysis) {
sections.push(`\n\n**Root Cause Analysis**\n${analysis}`);
}
}
return sections.join("\n");
}
private static buildKubernetesRootCauseAnalysis(input: {
breakdown: KubernetesResourceBreakdown;
topResource: KubernetesAffectedResource;
}): string | null {
const { breakdown, topResource } = input;
const metricName: string = breakdown.metricName;
const lines: Array<string> = [];
if (
metricName === "k8s.container.restarts" ||
metricName.includes("restart")
) {
lines.push(
`Container restart count is elevated, indicating a potential CrashLoopBackOff condition.`,
);
if (topResource.containerName) {
lines.push(
`The container \`${topResource.containerName}\` in pod \`${topResource.podName || "unknown"}\` has restarted **${topResource.metricValue}** times.`,
);
}
lines.push(
`Common causes: application crash on startup, misconfigured environment variables, missing dependencies, OOM (Out of Memory) kills, failed health checks, or missing config maps/secrets.`,
);
lines.push(
`Recommended actions: Check container logs with \`kubectl logs ${topResource.podName || "<pod-name>"} -c ${topResource.containerName || "<container>"} --previous\` and inspect events with \`kubectl describe pod ${topResource.podName || "<pod-name>"}\`.`,
);
} else if (
metricName === "k8s.pod.phase" &&
breakdown.attributes["k8s.pod.phase"] === "Pending"
) {
lines.push(`Pods are stuck in Pending phase and unable to be scheduled.`);
lines.push(
`Common causes: insufficient CPU/memory resources on nodes, node affinity/taint restrictions preventing scheduling, PersistentVolumeClaim pending, or resource quota exceeded.`,
);
if (topResource.podName) {
lines.push(
`Recommended actions: Check scheduling events with \`kubectl describe pod ${topResource.podName}\` and verify node resources with \`kubectl describe nodes\`.`,
);
}
} else if (
metricName === "k8s.node.condition_ready" ||
(metricName.includes("node") && metricName.includes("condition"))
) {
lines.push(`One or more nodes have transitioned to a NotReady state.`);
if (topResource.nodeName) {
lines.push(
`Node \`${topResource.nodeName}\` is reporting NotReady (value: ${topResource.metricValue}).`,
);
}
lines.push(
`Common causes: kubelet process failure, node resource exhaustion (disk pressure, memory pressure, PID pressure), network connectivity issues, or underlying VM/hardware failure.`,
);
lines.push(
`Recommended actions: Check node conditions with \`kubectl describe node ${topResource.nodeName || "<node-name>"}\` and verify kubelet status on the node.`,
);
} else if (
metricName === "k8s.node.cpu.utilization" ||
(metricName.includes("cpu") && metricName.includes("utilization"))
) {
lines.push(`Node CPU utilization has exceeded the configured threshold.`);
if (topResource.nodeName) {
lines.push(
`Node \`${topResource.nodeName}\` is at **${topResource.metricValue.toFixed(1)}%** CPU utilization.`,
);
}
lines.push(
`Common causes: resource-intensive workloads, insufficient resource limits on pods, noisy neighbor pods consuming excessive CPU, or insufficient cluster capacity.`,
);
lines.push(
`Recommended actions: Identify top CPU consumers with \`kubectl top pods --all-namespaces --sort-by=cpu\` and consider scaling the cluster or adjusting pod resource limits.`,
);
} else if (
metricName === "k8s.node.memory.usage" ||
(metricName.includes("memory") && metricName.includes("usage"))
) {
lines.push(
`Node memory utilization has exceeded the configured threshold.`,
);
if (topResource.nodeName) {
lines.push(
`Node \`${topResource.nodeName}\` memory usage is at **${topResource.metricValue.toFixed(1)}%**.`,
);
}
lines.push(
`Common causes: memory leaks in applications, insufficient memory limits on pods, too many pods scheduled on the node, or growing dataset sizes.`,
);
lines.push(
`Recommended actions: Check memory consumers with \`kubectl top pods --all-namespaces --sort-by=memory\` and review pod memory limits. Consider scaling the cluster or adding nodes with more memory.`,
);
} else if (
metricName === "k8s.deployment.unavailable_replicas" ||
metricName.includes("unavailable")
) {
lines.push(
`Deployment has unavailable replicas, indicating a mismatch between desired and available replicas.`,
);
if (topResource.workloadName) {
lines.push(
`${topResource.workloadType || "Deployment"} \`${topResource.workloadName}\` has **${topResource.metricValue}** unavailable replica(s).`,
);
}
lines.push(
`Common causes: failed rolling update, image pull errors (wrong image tag or missing registry credentials), pod crash loops, insufficient cluster resources to schedule new pods, or PodDisruptionBudget blocking updates.`,
);
lines.push(
`Recommended actions: Check deployment rollout status with \`kubectl rollout status deployment/${topResource.workloadName || "<deployment>"}\` and inspect pod events.`,
);
} else if (
metricName === "k8s.job.failed_pods" ||
(metricName.includes("job") && metricName.includes("fail"))
) {
lines.push(`Kubernetes Job has failed pods.`);
if (topResource.workloadName) {
lines.push(
`Job \`${topResource.workloadName}\` has **${topResource.metricValue}** failed pod(s).`,
);
}
lines.push(
`Common causes: application error or non-zero exit code, resource limits exceeded (OOMKilled), misconfigured command or arguments, missing environment variables, or timeout exceeded.`,
);
lines.push(
`Recommended actions: Check job status with \`kubectl describe job ${topResource.workloadName || "<job-name>"}\` and review pod logs for the failed pod(s).`,
);
} else if (
metricName === "k8s.node.filesystem.usage" ||
metricName.includes("disk") ||
metricName.includes("filesystem")
) {
lines.push(
`Node disk/filesystem usage has exceeded the configured threshold.`,
);
if (topResource.nodeName) {
lines.push(
`Node \`${topResource.nodeName}\` filesystem usage is at **${topResource.metricValue.toFixed(1)}%**.`,
);
}
lines.push(
`Common causes: container image layers consuming disk space, excessive logging, large emptyDir volumes, or accumulation of unused container images.`,
);
lines.push(
`Recommended actions: Clean up unused images with \`docker system prune\` or \`crictl rmi --prune\`, check for large log files, and review PersistentVolumeClaim usage.`,
);
} else if (
metricName === "k8s.daemonset.misscheduled_nodes" ||
metricName.includes("daemonset")
) {
lines.push(`DaemonSet has misscheduled or unavailable nodes.`);
if (topResource.workloadName) {
lines.push(
`DaemonSet \`${topResource.workloadName}\` has **${topResource.metricValue}** misscheduled node(s).`,
);
}
lines.push(
`Common causes: node taints preventing scheduling, incorrect node selectors, or node affinity rules excluding certain nodes.`,
);
lines.push(
`Recommended actions: Check DaemonSet status with \`kubectl describe daemonset ${topResource.workloadName || "<daemonset>"}\` and verify node labels and taints.`,
);
} else {
// Generic Kubernetes context
lines.push(
`Kubernetes metric \`${metricName}\` (${breakdown.metricFriendlyName}) has breached the configured threshold.`,
);
if (topResource.podName) {
lines.push(`Most affected pod: \`${topResource.podName}\``);
}
if (topResource.nodeName) {
lines.push(`Most affected node: \`${topResource.nodeName}\``);
}
lines.push(
`Recommended actions: Investigate the affected resources using \`kubectl describe\` and \`kubectl logs\` commands.`,
);
}
return lines.join("\n");
}
private static getMonitorDestinationString(input: {
monitorStep: MonitorStep;
probeResponse: ProbeMonitorResponse | null;
}): string | null {
if (input.probeResponse?.monitorDestination) {
return MonitorCriteriaEvaluator.stringifyValue(
input.probeResponse.monitorDestination,
);
}
if (input.monitorStep.data?.monitorDestination) {
return MonitorCriteriaEvaluator.stringifyValue(
input.monitorStep.data.monitorDestination,
);
}
return null;
}
private static getMonitorPortString(input: {
monitorStep: MonitorStep;
probeResponse: ProbeMonitorResponse | null;
}): string | null {
if (input.probeResponse?.monitorDestinationPort) {
return MonitorCriteriaEvaluator.stringifyValue(
input.probeResponse.monitorDestinationPort,
);
}
if (input.monitorStep.data?.monitorDestinationPort) {
return MonitorCriteriaEvaluator.stringifyValue(
input.monitorStep.data.monitorDestinationPort,
);
}
return null;
}
private static getRequestMethodString(input: {
monitor: Monitor;
monitorStep: MonitorStep;
}): string | null {
if (
input.monitor.monitorType === MonitorType.API &&
input.monitorStep.data
) {
return `${input.monitorStep.data.requestType}`;
}
return null;
}
private static formatMilliseconds(value?: number): string | null {
if (value === undefined || value === null || isNaN(value)) {
return null;
}
const formatted: string | null =
MonitorCriteriaMessageFormatter.formatNumber(value, {
maximumFractionDigits: value < 100 ? 2 : value < 1000 ? 1 : 0,
});
return `${formatted ?? value} ms`;
}
private static stringifyValue(value: unknown): string | null {
if (value === null || value === undefined) {
return null;
}
try {
// Handle primitive types directly
if (typeof value === "string") {
return value.trim();
}
if (typeof value === "number" || typeof value === "boolean") {
return String(value);
}
// Handle class instances with custom toString method (like URL, IP, Hostname)
if (
value instanceof URL ||
value instanceof IP ||
value instanceof Hostname ||
value instanceof Port
) {
return value.toString().trim();
}
/*
* Handle JSON representations of URL, IP, Hostname, Port (e.g., { _type: "URL", value: "https://..." })
* This can happen when the value wasn't properly deserialized from JSON
*/
if (typeof value === "object" && value !== null && "_type" in value) {
const typedValue: { _type: string; value?: unknown } = value as {
_type: string;
value?: unknown;
};
if (
(typedValue._type === "URL" ||
typedValue._type === "IP" ||
typedValue._type === "Hostname" ||
typedValue._type === "Port") &&
typeof typedValue.value === "string"
) {
return typedValue.value.trim();
}
}
return String(value).trim();
} catch (err) {
logger.error(err);
return null;
}
}
}