From 70e6924cdd96b01f6a82c06fa6b04113a735eadc Mon Sep 17 00:00:00 2001 From: Nawaz Dhandala Date: Fri, 20 Mar 2026 11:08:47 +0000 Subject: [PATCH] feat(kubernetes): add service mesh and VPA management features - Implemented KubernetesClusterServiceMesh component to display Istio and Linkerd metrics. - Created KubernetesClusterVPADetail component for detailed views of Vertical Pod Autoscalers (VPA). - Added KubernetesClusterVPAs component to list all VPAs in a cluster. - Introduced KubernetesAlertTemplates for monitoring various Kubernetes conditions, including CrashLoopBackOff, Pod Pending, Node Not Ready, and more. - Developed MonitorStepKubernetesMonitor interface for Kubernetes monitoring configurations. --- .../KubernetesMonitorStepForm.tsx | 356 +++++++++ .../Components/Form/Monitor/MonitorStep.tsx | 22 + .../Utils/KubernetesObjectFetcher.ts | 10 +- .../Utils/KubernetesObjectParser.ts | 284 +++++++ .../src/Pages/Kubernetes/View/Alerts.tsx | 208 ++++++ .../src/Pages/Kubernetes/View/HPADetail.tsx | 233 ++++++ .../src/Pages/Kubernetes/View/HPAs.tsx | 164 ++++ .../src/Pages/Kubernetes/View/ServiceMesh.tsx | 272 +++++++ .../src/Pages/Kubernetes/View/SideMenu.tsx | 37 + .../src/Pages/Kubernetes/View/VPADetail.tsx | 220 ++++++ .../src/Pages/Kubernetes/View/VPAs.tsx | 140 ++++ .../Dashboard/src/Routes/KubernetesRoutes.tsx | 84 +++ .../Breadcrumbs/KubernetesBreadcrumbs.ts | 26 + .../src/Utils/Form/Monitor/CriteriaFilter.ts | 2 +- App/FeatureSet/Dashboard/src/Utils/PageMap.ts | 5 + .../Dashboard/src/Utils/RouteMap.ts | 35 + .../Utils/Monitor/MonitorCriteriaEvaluator.ts | 5 +- .../Types/Monitor/KubernetesAlertTemplates.ts | 706 ++++++++++++++++++ .../Types/Monitor/MonitorCriteriaInstance.ts | 86 +++ Common/Types/Monitor/MonitorStep.ts | 37 +- .../Monitor/MonitorStepKubernetesMonitor.ts | 50 ++ Common/Types/Monitor/MonitorType.ts | 24 +- .../templates/configmap-deployment.yaml | 62 +- .../kubernetes-agent/templates/rbac.yaml | 18 + 
HelmChart/Public/kubernetes-agent/values.yaml | 10 + .../MonitorTelemetryMonitor.ts | 129 ++++ 26 files changed, 3208 insertions(+), 17 deletions(-) create mode 100644 App/FeatureSet/Dashboard/src/Components/Form/Monitor/KubernetesMonitor/KubernetesMonitorStepForm.tsx create mode 100644 App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/Alerts.tsx create mode 100644 App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPADetail.tsx create mode 100644 App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPAs.tsx create mode 100644 App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx create mode 100644 App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPADetail.tsx create mode 100644 App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPAs.tsx create mode 100644 Common/Types/Monitor/KubernetesAlertTemplates.ts create mode 100644 Common/Types/Monitor/MonitorStepKubernetesMonitor.ts diff --git a/App/FeatureSet/Dashboard/src/Components/Form/Monitor/KubernetesMonitor/KubernetesMonitorStepForm.tsx b/App/FeatureSet/Dashboard/src/Components/Form/Monitor/KubernetesMonitor/KubernetesMonitorStepForm.tsx new file mode 100644 index 0000000000..5a5ef22419 --- /dev/null +++ b/App/FeatureSet/Dashboard/src/Components/Form/Monitor/KubernetesMonitor/KubernetesMonitorStepForm.tsx @@ -0,0 +1,356 @@ +import MonitorStepKubernetesMonitor, { + MonitorStepKubernetesMonitorUtil, + KubernetesResourceScope, +} from "Common/Types/Monitor/MonitorStepKubernetesMonitor"; +import React, { FunctionComponent, ReactElement, useEffect } from "react"; +import MetricView from "../../../Metrics/MetricView"; +import RollingTime from "Common/Types/RollingTime/RollingTime"; +import InBetween from "Common/Types/BaseDatabase/InBetween"; +import RollingTimePicker from "Common/UI/Components/RollingTimePicker/RollingTimePicker"; +import RollingTimeUtil from "Common/Types/RollingTime/RollingTimeUtil"; +import FieldLabelElement from "Common/UI/Components/Forms/Fields/FieldLabel"; +import MetricViewData 
from "Common/Types/Metrics/MetricViewData"; +import Dropdown, { + DropdownOption, + DropdownValue, +} from "Common/UI/Components/Dropdown/Dropdown"; +import Input from "Common/UI/Components/Input/Input"; +import KubernetesCluster from "Common/Models/DatabaseModels/KubernetesCluster"; +import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI"; +import ListResult from "Common/Types/BaseDatabase/ListResult"; +import API from "Common/UI/Utils/API/API"; +import { LIMIT_PER_PROJECT } from "Common/Types/Database/LimitMax"; +import SortOrder from "Common/Types/BaseDatabase/SortOrder"; + +export interface ComponentProps { + monitorStepKubernetesMonitor: MonitorStepKubernetesMonitor; + onChange: ( + monitorStepKubernetesMonitor: MonitorStepKubernetesMonitor, + ) => void; +} + +const resourceScopeOptions: Array = [ + { + label: "Cluster", + value: KubernetesResourceScope.Cluster, + }, + { + label: "Namespace", + value: KubernetesResourceScope.Namespace, + }, + { + label: "Workload", + value: KubernetesResourceScope.Workload, + }, + { + label: "Node", + value: KubernetesResourceScope.Node, + }, + { + label: "Pod", + value: KubernetesResourceScope.Pod, + }, +]; + +const KubernetesMonitorStepForm: FunctionComponent = ( + props: ComponentProps, +): ReactElement => { + const [rollingTime, setRollingTime] = React.useState( + null, + ); + + const monitorStepKubernetesMonitor: MonitorStepKubernetesMonitor = + props.monitorStepKubernetesMonitor || + MonitorStepKubernetesMonitorUtil.getDefault(); + + const [startAndEndTime, setStartAndEndTime] = + React.useState | null>(null); + + const [clusterOptions, setClusterOptions] = React.useState< + Array + >([]); + + const [_isLoadingClusters, setIsLoadingClusters] = + React.useState(true); + + useEffect(() => { + // Load clusters + setIsLoadingClusters(true); + ModelAPI.getList({ + modelType: KubernetesCluster, + query: {}, + select: { + _id: true, + name: true, + clusterIdentifier: true, + }, + sort: { + name: SortOrder.Ascending, + }, + 
limit: LIMIT_PER_PROJECT, + skip: 0, + }) + .then((result: ListResult) => { + const options: Array = result.data.map( + (cluster: KubernetesCluster) => { + return { + label: cluster.name || cluster.clusterIdentifier || "Unknown", + value: cluster.clusterIdentifier || "", + }; + }, + ); + setClusterOptions(options); + }) + .catch((err: Error) => { + // If error, provide empty options + setClusterOptions([]); + API.getFriendlyErrorMessage(err); + }) + .finally(() => { + setIsLoadingClusters(false); + }); + }, []); + + useEffect(() => { + if (rollingTime === monitorStepKubernetesMonitor.rollingTime) { + return; + } + + setRollingTime(monitorStepKubernetesMonitor.rollingTime); + + setStartAndEndTime( + RollingTimeUtil.convertToStartAndEndDate( + monitorStepKubernetesMonitor.rollingTime || RollingTime.Past1Minute, + ), + ); + }, [monitorStepKubernetesMonitor.rollingTime]); + + useEffect(() => { + setStartAndEndTime( + RollingTimeUtil.convertToStartAndEndDate( + monitorStepKubernetesMonitor.rollingTime || RollingTime.Past1Minute, + ), + ); + }, []); + + const showNamespaceFilter: boolean = + monitorStepKubernetesMonitor.resourceScope === + KubernetesResourceScope.Namespace || + monitorStepKubernetesMonitor.resourceScope === + KubernetesResourceScope.Workload || + monitorStepKubernetesMonitor.resourceScope === + KubernetesResourceScope.Pod; + + const showWorkloadFilter: boolean = + monitorStepKubernetesMonitor.resourceScope === + KubernetesResourceScope.Workload; + + const showNodeFilter: boolean = + monitorStepKubernetesMonitor.resourceScope === + KubernetesResourceScope.Node; + + const showPodFilter: boolean = + monitorStepKubernetesMonitor.resourceScope === + KubernetesResourceScope.Pod; + + return ( +
+ + + option.value === monitorStepKubernetesMonitor.clusterIdentifier, + )} + onChange={(value: DropdownValue | Array | null) => { + props.onChange({ + ...monitorStepKubernetesMonitor, + clusterIdentifier: (value as string) || "", + }); + }} + placeholder="Select a cluster..." + /> + +
+ + + + option.value === monitorStepKubernetesMonitor.resourceScope, + )} + onChange={(value: DropdownValue | Array | null) => { + props.onChange({ + ...monitorStepKubernetesMonitor, + resourceScope: + (value as KubernetesResourceScope) || + KubernetesResourceScope.Cluster, + resourceFilters: {}, + }); + }} + placeholder="Select resource scope..." + /> + + {showNamespaceFilter && ( +
+ + { + props.onChange({ + ...monitorStepKubernetesMonitor, + resourceFilters: { + ...monitorStepKubernetesMonitor.resourceFilters, + namespace: value || undefined, + }, + }); + }} + placeholder="e.g. default, production" + /> +
+ )} + + {showWorkloadFilter && ( +
+ + { + props.onChange({ + ...monitorStepKubernetesMonitor, + resourceFilters: { + ...monitorStepKubernetesMonitor.resourceFilters, + workloadName: value || undefined, + }, + }); + }} + placeholder="e.g. my-deployment" + /> +
+ )} + + {showNodeFilter && ( +
+ + { + props.onChange({ + ...monitorStepKubernetesMonitor, + resourceFilters: { + ...monitorStepKubernetesMonitor.resourceFilters, + nodeName: value || undefined, + }, + }); + }} + placeholder="e.g. node-1" + /> +
+ )} + + {showPodFilter && ( +
+ + { + props.onChange({ + ...monitorStepKubernetesMonitor, + resourceFilters: { + ...monitorStepKubernetesMonitor.resourceFilters, + podName: value || undefined, + }, + }); + }} + placeholder="e.g. my-pod-abc123" + /> +
+ )} + +
+ + + { + if (value === monitorStepKubernetesMonitor.rollingTime) { + return; + } + + props.onChange({ + ...monitorStepKubernetesMonitor, + rollingTime: value, + }); + }} + /> + +
+ + + +
+ + { + props.onChange({ + ...monitorStepKubernetesMonitor, + metricViewConfig: { + queryConfigs: data.queryConfigs, + formulaConfigs: data.formulaConfigs, + }, + }); + }} + /> +
+ ); +}; + +export default KubernetesMonitorStepForm; diff --git a/App/FeatureSet/Dashboard/src/Components/Form/Monitor/MonitorStep.tsx b/App/FeatureSet/Dashboard/src/Components/Form/Monitor/MonitorStep.tsx index 77066d5be1..e79c564954 100644 --- a/App/FeatureSet/Dashboard/src/Components/Form/Monitor/MonitorStep.tsx +++ b/App/FeatureSet/Dashboard/src/Components/Form/Monitor/MonitorStep.tsx @@ -67,6 +67,10 @@ import MetricMonitorStepForm from "./MetricMonitor/MetricMonitorStepForm"; import MonitorStepMetricMonitor, { MonitorStepMetricMonitorUtil, } from "Common/Types/Monitor/MonitorStepMetricMonitor"; +import KubernetesMonitorStepForm from "./KubernetesMonitor/KubernetesMonitorStepForm"; +import MonitorStepKubernetesMonitor, { + MonitorStepKubernetesMonitorUtil, +} from "Common/Types/Monitor/MonitorStepKubernetesMonitor"; import Link from "Common/UI/Components/Link/Link"; import TinyFormDocumentation from "Common/UI/Components/TinyFormDocumentation/TinyFormDocumentation"; import ExceptionMonitorStepForm from "./ExceptionMonitor/ExceptionMonitorStepForm"; @@ -742,6 +746,24 @@ return { )} + {props.monitorType === MonitorType.Kubernetes && ( + + { + monitorStep.setKubernetesMonitor(value); + props.onChange?.(MonitorStep.clone(monitorStep)); + }} + /> + + )} + {props.monitorType === MonitorType.Traces && ( ; + }; + status: { + currentReplicas: number; + desiredReplicas: number; + conditions: Array; + }; +} + +export interface KubernetesVPAContainerRecommendation { + containerName: string; + target: Record; + lowerBound: Record; + upperBound: Record; +} + +export interface KubernetesVPAObject { + metadata: KubernetesObjectMetadata; + spec: { + targetRef: { + kind: string; + name: string; + }; + updatePolicy: { + updateMode: string; + }; + resourcePolicy: string; + }; + status: { + recommendation: { + containerRecommendations: Array; + }; + }; +} + /* * ============================================================ * Parsers @@ -1573,6 +1632,231 @@ export function 
parsePVObject( } } +export function parseHPAObject( + objectKvList: JSONObject, +): KubernetesHPAObject | null { + try { + const metadataKv: string | JSONObject | null = getKvValue( + objectKvList, + "metadata", + ); + if (!metadataKv || typeof metadataKv === "string") { + return null; + } + + const specKv: string | JSONObject | null = getKvValue(objectKvList, "spec"); + const statusKv: string | JSONObject | null = getKvValue( + objectKvList, + "status", + ); + + let minReplicas: number = 0; + let maxReplicas: number = 0; + let scaleTargetRef: { kind: string; name: string } = { + kind: "", + name: "", + }; + const metrics: Array = []; + if (specKv && typeof specKv !== "string") { + minReplicas = parseInt(getKvStringValue(specKv, "minReplicas")) || 0; + maxReplicas = parseInt(getKvStringValue(specKv, "maxReplicas")) || 0; + const targetRefKv: string | JSONObject | null = getKvValue( + specKv, + "scaleTargetRef", + ); + if (targetRefKv && typeof targetRefKv !== "string") { + scaleTargetRef = { + kind: getKvStringValue(targetRefKv, "kind"), + name: getKvStringValue(targetRefKv, "name"), + }; + } + const metricsArrayKv: string | JSONObject | null = getKvValue( + specKv, + "metrics", + ); + if (metricsArrayKv && typeof metricsArrayKv !== "string") { + const metricsItems: Array = getArrayValues(metricsArrayKv); + for (const metricKv of metricsItems) { + const metricType: string = getKvStringValue(metricKv, "type"); + let resourceName: string = ""; + let targetType: string = ""; + let targetValue: string = ""; + const resourceKv: string | JSONObject | null = getKvValue( + metricKv, + "resource", + ); + if (resourceKv && typeof resourceKv !== "string") { + resourceName = getKvStringValue(resourceKv, "name"); + const targetKv: string | JSONObject | null = getKvValue( + resourceKv, + "target", + ); + if (targetKv && typeof targetKv !== "string") { + targetType = getKvStringValue(targetKv, "type"); + targetValue = + getKvStringValue(targetKv, "averageUtilization") || + 
getKvStringValue(targetKv, "averageValue") || + getKvStringValue(targetKv, "value"); + } + } + metrics.push({ + type: metricType, + resourceName, + targetType, + targetValue, + }); + } + } + } + + let currentReplicas: number = 0; + let desiredReplicas: number = 0; + let conditions: Array = []; + if (statusKv && typeof statusKv !== "string") { + currentReplicas = + parseInt(getKvStringValue(statusKv, "currentReplicas")) || 0; + desiredReplicas = + parseInt(getKvStringValue(statusKv, "desiredReplicas")) || 0; + const condArray: string | JSONObject | null = getKvValue( + statusKv, + "conditions", + ); + if (condArray && typeof condArray !== "string") { + const condItems: Array = getArrayValues(condArray); + conditions = condItems.map( + (condKv: JSONObject): KubernetesHPACondition => { + return { + type: getKvStringValue(condKv, "type"), + status: getKvStringValue(condKv, "status"), + reason: getKvStringValue(condKv, "reason"), + message: getKvStringValue(condKv, "message"), + lastTransitionTime: getKvStringValue( + condKv, + "lastTransitionTime", + ), + }; + }, + ); + } + } + + return { + metadata: parseMetadata(metadataKv), + spec: { minReplicas, maxReplicas, scaleTargetRef, metrics }, + status: { currentReplicas, desiredReplicas, conditions }, + }; + } catch { + return null; + } +} + +export function parseVPAObject( + objectKvList: JSONObject, +): KubernetesVPAObject | null { + try { + const metadataKv: string | JSONObject | null = getKvValue( + objectKvList, + "metadata", + ); + if (!metadataKv || typeof metadataKv === "string") { + return null; + } + + const specKv: string | JSONObject | null = getKvValue(objectKvList, "spec"); + const statusKv: string | JSONObject | null = getKvValue( + objectKvList, + "status", + ); + + let targetRef: { kind: string; name: string } = { kind: "", name: "" }; + let updatePolicy: { updateMode: string } = { updateMode: "" }; + let resourcePolicy: string = ""; + if (specKv && typeof specKv !== "string") { + const targetRefKv: string 
| JSONObject | null = getKvValue( + specKv, + "targetRef", + ); + if (targetRefKv && typeof targetRefKv !== "string") { + targetRef = { + kind: getKvStringValue(targetRefKv, "kind"), + name: getKvStringValue(targetRefKv, "name"), + }; + } + const updatePolicyKv: string | JSONObject | null = getKvValue( + specKv, + "updatePolicy", + ); + if (updatePolicyKv && typeof updatePolicyKv !== "string") { + updatePolicy = { + updateMode: getKvStringValue(updatePolicyKv, "updateMode"), + }; + } + resourcePolicy = getKvStringValue(specKv, "resourcePolicy"); + } + + const containerRecommendations: Array = + []; + if (statusKv && typeof statusKv !== "string") { + const recommendationKv: string | JSONObject | null = getKvValue( + statusKv, + "recommendation", + ); + if (recommendationKv && typeof recommendationKv !== "string") { + const containerRecsArrayKv: string | JSONObject | null = getKvValue( + recommendationKv, + "containerRecommendations", + ); + if ( + containerRecsArrayKv && + typeof containerRecsArrayKv !== "string" + ) { + const recItems: Array = + getArrayValues(containerRecsArrayKv); + for (const recKv of recItems) { + const targetKv: string | JSONObject | null = getKvValue( + recKv, + "target", + ); + const lowerBoundKv: string | JSONObject | null = getKvValue( + recKv, + "lowerBound", + ); + const upperBoundKv: string | JSONObject | null = getKvValue( + recKv, + "upperBound", + ); + containerRecommendations.push({ + containerName: getKvStringValue(recKv, "containerName"), + target: + targetKv && typeof targetKv !== "string" + ? getKvListAsRecord(targetKv) + : {}, + lowerBound: + lowerBoundKv && typeof lowerBoundKv !== "string" + ? getKvListAsRecord(lowerBoundKv) + : {}, + upperBound: + upperBoundKv && typeof upperBoundKv !== "string" + ? 
getKvListAsRecord(upperBoundKv) + : {}, + }); + } + } + } + } + + return { + metadata: parseMetadata(metadataKv), + spec: { targetRef, updatePolicy, resourcePolicy }, + status: { + recommendation: { containerRecommendations }, + }, + }; + } catch { + return null; + } +} + /** * Extract the K8s object from a raw OTLP log body string. * For k8sobjects pull mode, the body is: diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/Alerts.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/Alerts.tsx new file mode 100644 index 0000000000..ba62ded383 --- /dev/null +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/Alerts.tsx @@ -0,0 +1,208 @@ +import PageComponentProps from "../../PageComponentProps"; +import ObjectID from "Common/Types/ObjectID"; +import Navigation from "Common/UI/Utils/Navigation"; +import KubernetesCluster from "Common/Models/DatabaseModels/KubernetesCluster"; +import Card from "Common/UI/Components/Card/Card"; +import React, { + Fragment, + FunctionComponent, + ReactElement, + useEffect, + useState, +} from "react"; +import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI"; +import API from "Common/UI/Utils/API/API"; +import PageLoader from "Common/UI/Components/Loader/PageLoader"; +import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; +import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; +import { + getAllKubernetesAlertTemplates, + KubernetesAlertTemplate, + KubernetesAlertTemplateCategory, +} from "Common/Types/Monitor/KubernetesAlertTemplates"; +import IconProp from "Common/Types/Icon/IconProp"; +import Icon from "Common/UI/Components/Icon/Icon"; +import Button, { ButtonStyleType } from "Common/UI/Components/Button/Button"; +import RouteMap from "../../../Utils/RouteMap"; +import PageMap from "../../../Utils/PageMap"; +import Route from "Common/Types/API/Route"; + +const KubernetesClusterAlerts: FunctionComponent< + PageComponentProps +> = (): ReactElement => { + const modelId: ObjectID = 
Navigation.getLastParamAsObjectID(1); + + const [cluster, setCluster] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(""); + + const fetchCluster: PromiseVoidFunction = async (): Promise => { + setIsLoading(true); + try { + const item: KubernetesCluster | null = await ModelAPI.getItem({ + modelType: KubernetesCluster, + id: modelId, + select: { + clusterIdentifier: true, + name: true, + }, + }); + setCluster(item); + } catch (err) { + setError(API.getFriendlyMessage(err)); + } + setIsLoading(false); + }; + + useEffect(() => { + fetchCluster().catch((err: Error) => { + setError(API.getFriendlyMessage(err)); + }); + }, []); + + if (isLoading) { + return ; + } + + if (error) { + return ; + } + + if (!cluster) { + return ; + } + + const allTemplates: Array = + getAllKubernetesAlertTemplates(); + + const categories: Array = [ + "Workload", + "Node", + "ControlPlane", + "Storage", + "Scheduling", + ]; + + const getCategoryIcon = ( + category: KubernetesAlertTemplateCategory, + ): IconProp => { + switch (category) { + case "Workload": + return IconProp.Cube; + case "Node": + return IconProp.Server; + case "ControlPlane": + return IconProp.Settings; + case "Storage": + return IconProp.Disc; + case "Scheduling": + return IconProp.Clock; + default: + return IconProp.Alert; + } + }; + + const getCategoryDescription = ( + category: KubernetesAlertTemplateCategory, + ): string => { + switch (category) { + case "Workload": + return "Monitor workload health including pod restarts, replica mismatches, and job failures."; + case "Node": + return "Monitor node health including CPU, memory, disk usage, and node readiness."; + case "ControlPlane": + return "Monitor Kubernetes control plane components including etcd, API server, and scheduler."; + case "Storage": + return "Monitor storage resources including disk usage and persistent volume claims."; + case "Scheduling": + return "Monitor pod scheduling including pending pods 
and scheduler backlog."; + default: + return ""; + } + }; + + return ( + +
+

+ Pre-built alert templates for common Kubernetes failure patterns. Click + "Create Monitor" to set up monitoring for your cluster{" "} + {cluster.name || cluster.clusterIdentifier}. +

+
+ + {categories.map((category: KubernetesAlertTemplateCategory) => { + const categoryTemplates: Array = + allTemplates.filter( + (t: KubernetesAlertTemplate) => t.category === category, + ); + + if (categoryTemplates.length === 0) { + return null; + } + + return ( + + + {category === "ControlPlane" ? "Control Plane" : category} + + } + description={getCategoryDescription(category)} + > +
+ {categoryTemplates.map( + (template: KubernetesAlertTemplate) => { + return ( +
+
+
+ + {template.name} + + + {template.severity} + +
+

+ {template.description} +

+
+
+
+
+ ); + }, + )} +
+
+ ); + })} +
+ ); +}; + +export default KubernetesClusterAlerts; diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPADetail.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPADetail.tsx new file mode 100644 index 0000000000..ddacb696d4 --- /dev/null +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPADetail.tsx @@ -0,0 +1,233 @@ +import PageComponentProps from "../../PageComponentProps"; +import ObjectID from "Common/Types/ObjectID"; +import Navigation from "Common/UI/Utils/Navigation"; +import KubernetesCluster from "Common/Models/DatabaseModels/KubernetesCluster"; +import Card from "Common/UI/Components/Card/Card"; +import React, { + FunctionComponent, + ReactElement, + useEffect, + useState, +} from "react"; +import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI"; +import API from "Common/UI/Utils/API/API"; +import PageLoader from "Common/UI/Components/Loader/PageLoader"; +import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; +import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; +import Tabs from "Common/UI/Components/Tabs/Tabs"; +import { Tab } from "Common/UI/Components/Tabs/Tab"; +import KubernetesOverviewTab from "../../../Components/Kubernetes/KubernetesOverviewTab"; +import KubernetesEventsTab from "../../../Components/Kubernetes/KubernetesEventsTab"; +import { KubernetesHPAObject } from "../Utils/KubernetesObjectParser"; +import { fetchLatestK8sObject } from "../Utils/KubernetesObjectFetcher"; +import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils"; +import KubernetesYamlTab from "../../../Components/Kubernetes/KubernetesYamlTab"; +import StatusBadge, { + StatusBadgeType, +} from "Common/UI/Components/StatusBadge/StatusBadge"; +import KubernetesResourceLink from "../../../Components/Kubernetes/KubernetesResourceLink"; + +const KubernetesClusterHPADetail: FunctionComponent< + PageComponentProps +> = (): ReactElement => { + const modelId: ObjectID = Navigation.getLastParamAsObjectID(2); 
+ const hpaName: string = Navigation.getLastParamAsString(); + + const [cluster, setCluster] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(""); + const [objectData, setObjectData] = + useState(null); + const [isLoadingObject, setIsLoadingObject] = useState(true); + + const fetchCluster: PromiseVoidFunction = async (): Promise => { + setIsLoading(true); + try { + const item: KubernetesCluster | null = await ModelAPI.getItem({ + modelType: KubernetesCluster, + id: modelId, + select: { + clusterIdentifier: true, + }, + }); + setCluster(item); + } catch (err) { + setError(API.getFriendlyMessage(err)); + } + setIsLoading(false); + }; + + useEffect(() => { + fetchCluster().catch((err: Error) => { + setError(API.getFriendlyMessage(err)); + }); + }, []); + + useEffect(() => { + if (!cluster?.clusterIdentifier) { + return; + } + + const fetchObject: () => Promise = async (): Promise => { + setIsLoadingObject(true); + try { + const obj: KubernetesHPAObject | null = + await fetchLatestK8sObject({ + clusterIdentifier: cluster.clusterIdentifier || "", + resourceType: "horizontalpodautoscalers", + resourceName: hpaName, + }); + setObjectData(obj); + } catch { + // Graceful degradation — overview tab shows empty state + } + setIsLoadingObject(false); + }; + + fetchObject().catch(() => {}); + }, [cluster?.clusterIdentifier, hpaName]); + + if (isLoading) { + return ; + } + + if (error) { + return ; + } + + if (!cluster) { + return ; + } + + const clusterIdentifier: string = cluster.clusterIdentifier || ""; + + const summaryFields: Array<{ title: string; value: string | ReactElement }> = + [ + { title: "Name", value: hpaName }, + { title: "Cluster", value: clusterIdentifier }, + ]; + + if (objectData) { + const currentReplicas: number = objectData.status.currentReplicas; + const desiredReplicas: number = objectData.status.desiredReplicas; + const isStable: boolean = currentReplicas === desiredReplicas; + + 
summaryFields.push( + { + title: "Namespace", + value: objectData.metadata.namespace ? ( + + ) : ( + "default" + ), + }, + { + title: "Target Kind", + value: objectData.spec.scaleTargetRef.kind || "N/A", + }, + { + title: "Target Name", + value: objectData.spec.scaleTargetRef.name || "N/A", + }, + { + title: "Min Replicas", + value: String(objectData.spec.minReplicas), + }, + { + title: "Max Replicas", + value: String(objectData.spec.maxReplicas), + }, + { + title: "Current Replicas", + value: String(currentReplicas), + }, + { + title: "Desired Replicas", + value: String(desiredReplicas), + }, + { + title: "Scaling Status", + value: ( + + ), + }, + { + title: "Created", + value: objectData.metadata.creationTimestamp + ? KubernetesResourceUtils.formatAge( + objectData.metadata.creationTimestamp, + ) + : "N/A", + }, + ); + + if (objectData.spec.metrics.length > 0) { + const metricsDisplay: string = objectData.spec.metrics + .map((m) => { + if (m.resourceName) { + return `${m.resourceName} (${m.targetType}: ${m.targetValue})`; + } + return m.type; + }) + .join(", "); + summaryFields.push({ + title: "Metrics", + value: metricsDisplay || "N/A", + }); + } + } + + const tabs: Array = [ + { + name: "Overview", + children: ( + + ), + }, + { + name: "Events", + children: ( + + + + ), + }, + { + name: "YAML", + children: ( + + ), + }, + ]; + + return {}} />; +}; + +export default KubernetesClusterHPADetail; diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPAs.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPAs.tsx new file mode 100644 index 0000000000..489c727c87 --- /dev/null +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/HPAs.tsx @@ -0,0 +1,164 @@ +import PageComponentProps from "../../PageComponentProps"; +import ObjectID from "Common/Types/ObjectID"; +import Navigation from "Common/UI/Utils/Navigation"; +import KubernetesCluster from "Common/Models/DatabaseModels/KubernetesCluster"; +import KubernetesResourceTable from 
"../../../Components/Kubernetes/KubernetesResourceTable"; +import { + KubernetesResource, +} from "../Utils/KubernetesResourceUtils"; +import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils"; +import React, { + FunctionComponent, + ReactElement, + useEffect, + useState, +} from "react"; +import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI"; +import API from "Common/UI/Utils/API/API"; +import PageLoader from "Common/UI/Components/Loader/PageLoader"; +import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; +import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; +import PageMap from "../../../Utils/PageMap"; +import RouteMap, { RouteUtil } from "../../../Utils/RouteMap"; +import Route from "Common/Types/API/Route"; +import { + fetchK8sObjectsBatch, + KubernetesObjectType, +} from "../Utils/KubernetesObjectFetcher"; +import { KubernetesHPAObject } from "../Utils/KubernetesObjectParser"; + +const KubernetesClusterHPAs: FunctionComponent< + PageComponentProps +> = (): ReactElement => { + const modelId: ObjectID = Navigation.getLastParamAsObjectID(1); + + const [resources, setResources] = useState>([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(""); + + const fetchData: PromiseVoidFunction = async (): Promise => { + setIsLoading(true); + try { + const cluster: KubernetesCluster | null = await ModelAPI.getItem({ + modelType: KubernetesCluster, + id: modelId, + select: { + clusterIdentifier: true, + }, + }); + + if (!cluster?.clusterIdentifier) { + setError("Cluster not found."); + setIsLoading(false); + return; + } + + const hpaObjects: Map = + await fetchK8sObjectsBatch({ + clusterIdentifier: cluster.clusterIdentifier, + resourceType: "horizontalpodautoscalers", + }); + + const hpaResources: Array = []; + + for (const hpaObj of hpaObjects.values()) { + const hpa: KubernetesHPAObject = hpaObj as KubernetesHPAObject; + + const currentReplicas: number = hpa.status.currentReplicas; 
+ const desiredReplicas: number = hpa.status.desiredReplicas; + + let status: string = "Active"; + if (currentReplicas === desiredReplicas && currentReplicas > 0) { + status = "Active"; + } else if (currentReplicas < desiredReplicas) { + status = "Scaling Up"; + } else if (currentReplicas > desiredReplicas) { + status = "Scaling Down"; + } + + hpaResources.push({ + name: hpa.metadata.name, + namespace: hpa.metadata.namespace || "default", + cpuUtilization: null, + memoryUsageBytes: null, + memoryLimitBytes: null, + status: status, + age: KubernetesResourceUtils.formatAge( + hpa.metadata.creationTimestamp, + ), + additionalAttributes: { + target: `${hpa.spec.scaleTargetRef.kind}/${hpa.spec.scaleTargetRef.name}`, + minReplicas: String(hpa.spec.minReplicas), + maxReplicas: String(hpa.spec.maxReplicas), + currentReplicas: String(currentReplicas), + desiredReplicas: String(desiredReplicas), + }, + }); + } + + setResources(hpaResources); + } catch (err) { + setError(API.getFriendlyMessage(err)); + } + setIsLoading(false); + }; + + useEffect(() => { + fetchData().catch((err: Error) => { + setError(API.getFriendlyMessage(err)); + }); + }, []); + + if (isLoading) { + return ; + } + + if (error) { + return ; + } + + return ( + { + return RouteUtil.populateRouteParams( + RouteMap[ + PageMap.KUBERNETES_CLUSTER_VIEW_HPA_DETAIL + ] as Route, + { + modelId: modelId, + subModelId: new ObjectID(resource.name), + }, + ); + }} + emptyMessage="No HPAs found. HPA data will appear here once the kubernetes-agent Helm chart has resourceSpecs.enabled set to true and includes horizontalpodautoscalers." 
+ /> + ); +}; + +export default KubernetesClusterHPAs; diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx new file mode 100644 index 0000000000..9131cc2fb4 --- /dev/null +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx @@ -0,0 +1,272 @@ +import PageComponentProps from "../../PageComponentProps"; +import ObjectID from "Common/Types/ObjectID"; +import Navigation from "Common/UI/Utils/Navigation"; +import KubernetesCluster from "Common/Models/DatabaseModels/KubernetesCluster"; +import Card from "Common/UI/Components/Card/Card"; +import MetricView from "../../../Components/Metrics/MetricView"; +import MetricViewData from "Common/Types/Metrics/MetricViewData"; +import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData"; +import AggregationType from "Common/Types/BaseDatabase/AggregationType"; +import OneUptimeDate from "Common/Types/Date"; +import InBetween from "Common/Types/BaseDatabase/InBetween"; +import React, { + Fragment, + FunctionComponent, + ReactElement, + useEffect, + useState, +} from "react"; +import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI"; +import API from "Common/UI/Utils/API/API"; +import PageLoader from "Common/UI/Components/Loader/PageLoader"; +import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; +import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; + +const KubernetesClusterServiceMesh: FunctionComponent< + PageComponentProps +> = (): ReactElement => { + const modelId: ObjectID = Navigation.getLastParamAsObjectID(1); + + const [cluster, setCluster] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(""); + const [istioRequestsMetricViewData, setIstioRequestsMetricViewData] = + useState(null); + const [istioLatencyMetricViewData, setIstioLatencyMetricViewData] = + useState(null); + const 
[linkerdRequestsMetricViewData, setLinkerdRequestsMetricViewData] = + useState(null); + const [linkerdLatencyMetricViewData, setLinkerdLatencyMetricViewData] = + useState(null); + + const fetchCluster: PromiseVoidFunction = async (): Promise => { + setIsLoading(true); + try { + const item: KubernetesCluster | null = await ModelAPI.getItem({ + modelType: KubernetesCluster, + id: modelId, + select: { + clusterIdentifier: true, + }, + }); + setCluster(item); + } catch (err) { + setError(API.getFriendlyMessage(err)); + } + setIsLoading(false); + }; + + useEffect(() => { + fetchCluster().catch((err: Error) => { + setError(API.getFriendlyMessage(err)); + }); + }, []); + + useEffect(() => { + if (!cluster) { + return; + } + + const clusterIdentifier: string = cluster.clusterIdentifier || ""; + const endDate: Date = OneUptimeDate.getCurrentDate(); + const startDate: Date = OneUptimeDate.addRemoveHours(endDate, -6); + const startAndEndDate: InBetween = new InBetween(startDate, endDate); + + // Istio metrics + const istioRequestsTotalQuery: MetricQueryConfigData = { + metricAliasData: { + metricVariable: "istio_requests_total", + title: "Istio Request Rate", + description: "Total requests through the Istio service mesh", + legend: "Requests", + legendUnit: "req/s", + }, + metricQueryData: { + filterData: { + metricName: "istio_requests_total", + attributes: { + "resource.k8s.cluster.name": clusterIdentifier, + }, + aggegationType: AggregationType.Sum, + aggregateBy: {}, + }, + groupBy: { + attributes: true, + }, + }, + }; + + const istioRequestDurationQuery: MetricQueryConfigData = { + metricAliasData: { + metricVariable: "istio_request_duration", + title: "Istio Request Latency", + description: + "Request duration through the Istio service mesh (p50/p99)", + legend: "Latency", + legendUnit: "ms", + }, + metricQueryData: { + filterData: { + metricName: "istio_request_duration_milliseconds_bucket", + attributes: { + "resource.k8s.cluster.name": clusterIdentifier, + }, + 
aggegationType: AggregationType.Avg, + aggregateBy: {}, + }, + groupBy: { + attributes: true, + }, + }, + }; + + // Linkerd metrics + const linkerdRequestTotalQuery: MetricQueryConfigData = { + metricAliasData: { + metricVariable: "linkerd_request_total", + title: "Linkerd Request Rate", + description: "Total requests through the Linkerd service mesh", + legend: "Requests", + legendUnit: "req/s", + }, + metricQueryData: { + filterData: { + metricName: "request_total", + attributes: { + "resource.k8s.cluster.name": clusterIdentifier, + }, + aggegationType: AggregationType.Sum, + aggregateBy: {}, + }, + groupBy: { + attributes: true, + }, + }, + }; + + const linkerdResponseLatencyQuery: MetricQueryConfigData = { + metricAliasData: { + metricVariable: "linkerd_response_latency", + title: "Linkerd Response Latency", + description: + "Response latency through the Linkerd service mesh (p50/p99)", + legend: "Latency", + legendUnit: "ms", + }, + metricQueryData: { + filterData: { + metricName: "response_latency_ms_bucket", + attributes: { + "resource.k8s.cluster.name": clusterIdentifier, + }, + aggegationType: AggregationType.Avg, + aggregateBy: {}, + }, + groupBy: { + attributes: true, + }, + }, + }; + + setIstioRequestsMetricViewData({ + startAndEndDate: startAndEndDate, + queryConfigs: [istioRequestsTotalQuery], + formulaConfigs: [], + }); + + setIstioLatencyMetricViewData({ + startAndEndDate: startAndEndDate, + queryConfigs: [istioRequestDurationQuery], + formulaConfigs: [], + }); + + setLinkerdRequestsMetricViewData({ + startAndEndDate: startAndEndDate, + queryConfigs: [linkerdRequestTotalQuery], + formulaConfigs: [], + }); + + setLinkerdLatencyMetricViewData({ + startAndEndDate: startAndEndDate, + queryConfigs: [linkerdResponseLatencyQuery], + formulaConfigs: [], + }); + }, [cluster]); + + if (isLoading) { + return ; + } + + if (error) { + return ; + } + + if ( + !cluster || + !istioRequestsMetricViewData || + !istioLatencyMetricViewData || + 
!linkerdRequestsMetricViewData || + !linkerdLatencyMetricViewData + ) { + return ; + } + + return ( + +
+

+ Service mesh metrics require the serviceMesh.enabled flag + to be set to true and the serviceMesh.provider{" "} + to be configured in the kubernetes-agent Helm chart values. Supported + providers are Istio and Linkerd. +

+
+ + + {}} + /> + + + + {}} + /> + + + + {}} + /> + + + + {}} + /> + +
+ ); +}; + +export default KubernetesClusterServiceMesh; diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/SideMenu.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/SideMenu.tsx index 360f74daee..1cb3c3ac2a 100644 --- a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/SideMenu.tsx +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/SideMenu.tsx @@ -20,6 +20,8 @@ export interface ResourceCounts { containers?: number | undefined; pvcs?: number | undefined; pvs?: number | undefined; + hpas?: number | undefined; + vpas?: number | undefined; } export interface ComponentProps { @@ -188,6 +190,31 @@ const KubernetesClusterSideMenu: FunctionComponent = ( /> + + + + + = ( }} icon={IconProp.Activity} /> + diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPADetail.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPADetail.tsx new file mode 100644 index 0000000000..26be1da63b --- /dev/null +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPADetail.tsx @@ -0,0 +1,220 @@ +import PageComponentProps from "../../PageComponentProps"; +import ObjectID from "Common/Types/ObjectID"; +import Navigation from "Common/UI/Utils/Navigation"; +import KubernetesCluster from "Common/Models/DatabaseModels/KubernetesCluster"; +import Card from "Common/UI/Components/Card/Card"; +import React, { + FunctionComponent, + ReactElement, + useEffect, + useState, +} from "react"; +import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI"; +import API from "Common/UI/Utils/API/API"; +import PageLoader from "Common/UI/Components/Loader/PageLoader"; +import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; +import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; +import Tabs from "Common/UI/Components/Tabs/Tabs"; +import { Tab } from "Common/UI/Components/Tabs/Tab"; +import KubernetesOverviewTab from "../../../Components/Kubernetes/KubernetesOverviewTab"; +import KubernetesEventsTab from 
"../../../Components/Kubernetes/KubernetesEventsTab"; +import { KubernetesVPAObject } from "../Utils/KubernetesObjectParser"; +import { fetchLatestK8sObject } from "../Utils/KubernetesObjectFetcher"; +import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils"; +import KubernetesYamlTab from "../../../Components/Kubernetes/KubernetesYamlTab"; +import StatusBadge, { + StatusBadgeType, +} from "Common/UI/Components/StatusBadge/StatusBadge"; +import KubernetesResourceLink from "../../../Components/Kubernetes/KubernetesResourceLink"; + +const KubernetesClusterVPADetail: FunctionComponent< + PageComponentProps +> = (): ReactElement => { + const modelId: ObjectID = Navigation.getLastParamAsObjectID(2); + const vpaName: string = Navigation.getLastParamAsString(); + + const [cluster, setCluster] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(""); + const [objectData, setObjectData] = + useState(null); + const [isLoadingObject, setIsLoadingObject] = useState(true); + + const fetchCluster: PromiseVoidFunction = async (): Promise => { + setIsLoading(true); + try { + const item: KubernetesCluster | null = await ModelAPI.getItem({ + modelType: KubernetesCluster, + id: modelId, + select: { + clusterIdentifier: true, + }, + }); + setCluster(item); + } catch (err) { + setError(API.getFriendlyMessage(err)); + } + setIsLoading(false); + }; + + useEffect(() => { + fetchCluster().catch((err: Error) => { + setError(API.getFriendlyMessage(err)); + }); + }, []); + + useEffect(() => { + if (!cluster?.clusterIdentifier) { + return; + } + + const fetchObject: () => Promise = async (): Promise => { + setIsLoadingObject(true); + try { + const obj: KubernetesVPAObject | null = + await fetchLatestK8sObject({ + clusterIdentifier: cluster.clusterIdentifier || "", + resourceType: "verticalpodautoscalers", + resourceName: vpaName, + }); + setObjectData(obj); + } catch { + // Graceful degradation — overview tab shows empty state 
+ } + setIsLoadingObject(false); + }; + + fetchObject().catch(() => {}); + }, [cluster?.clusterIdentifier, vpaName]); + + if (isLoading) { + return ; + } + + if (error) { + return ; + } + + if (!cluster) { + return ; + } + + const clusterIdentifier: string = cluster.clusterIdentifier || ""; + + const summaryFields: Array<{ title: string; value: string | ReactElement }> = + [ + { title: "Name", value: vpaName }, + { title: "Cluster", value: clusterIdentifier }, + ]; + + if (objectData) { + const hasRecommendations: boolean = + objectData.status.recommendation.containerRecommendations.length > 0; + + summaryFields.push( + { + title: "Namespace", + value: objectData.metadata.namespace ? ( + + ) : ( + "default" + ), + }, + { + title: "Target Kind", + value: objectData.spec.targetRef.kind || "N/A", + }, + { + title: "Target Name", + value: objectData.spec.targetRef.name || "N/A", + }, + { + title: "Update Mode", + value: objectData.spec.updatePolicy.updateMode || "N/A", + }, + { + title: "Status", + value: ( + + ), + }, + { + title: "Created", + value: objectData.metadata.creationTimestamp + ? 
KubernetesResourceUtils.formatAge( + objectData.metadata.creationTimestamp, + ) + : "N/A", + }, + ); + + // Add container recommendations + if (hasRecommendations) { + for (const rec of objectData.status.recommendation + .containerRecommendations) { + const targetCpu: string = rec.target["cpu"] || "N/A"; + const targetMemory: string = rec.target["memory"] || "N/A"; + summaryFields.push({ + title: `Recommendation (${rec.containerName})`, + value: `CPU: ${targetCpu}, Memory: ${targetMemory}`, + }); + } + } + } + + const tabs: Array = [ + { + name: "Overview", + children: ( + + ), + }, + { + name: "Events", + children: ( + + + + ), + }, + { + name: "YAML", + children: ( + + ), + }, + ]; + + return {}} />; +}; + +export default KubernetesClusterVPADetail; diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPAs.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPAs.tsx new file mode 100644 index 0000000000..3cea4c38d3 --- /dev/null +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/VPAs.tsx @@ -0,0 +1,140 @@ +import PageComponentProps from "../../PageComponentProps"; +import ObjectID from "Common/Types/ObjectID"; +import Navigation from "Common/UI/Utils/Navigation"; +import KubernetesCluster from "Common/Models/DatabaseModels/KubernetesCluster"; +import KubernetesResourceTable from "../../../Components/Kubernetes/KubernetesResourceTable"; +import { + KubernetesResource, +} from "../Utils/KubernetesResourceUtils"; +import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils"; +import React, { + FunctionComponent, + ReactElement, + useEffect, + useState, +} from "react"; +import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI"; +import API from "Common/UI/Utils/API/API"; +import PageLoader from "Common/UI/Components/Loader/PageLoader"; +import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; +import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; +import PageMap from "../../../Utils/PageMap"; +import 
RouteMap, { RouteUtil } from "../../../Utils/RouteMap"; +import Route from "Common/Types/API/Route"; +import { + fetchK8sObjectsBatch, + KubernetesObjectType, +} from "../Utils/KubernetesObjectFetcher"; +import { KubernetesVPAObject } from "../Utils/KubernetesObjectParser"; + +const KubernetesClusterVPAs: FunctionComponent< + PageComponentProps +> = (): ReactElement => { + const modelId: ObjectID = Navigation.getLastParamAsObjectID(1); + + const [resources, setResources] = useState>([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(""); + + const fetchData: PromiseVoidFunction = async (): Promise => { + setIsLoading(true); + try { + const cluster: KubernetesCluster | null = await ModelAPI.getItem({ + modelType: KubernetesCluster, + id: modelId, + select: { + clusterIdentifier: true, + }, + }); + + if (!cluster?.clusterIdentifier) { + setError("Cluster not found."); + setIsLoading(false); + return; + } + + const vpaObjects: Map = + await fetchK8sObjectsBatch({ + clusterIdentifier: cluster.clusterIdentifier, + resourceType: "verticalpodautoscalers", + }); + + const vpaResources: Array = []; + + for (const vpaObj of vpaObjects.values()) { + const vpa: KubernetesVPAObject = vpaObj as KubernetesVPAObject; + + const hasRecommendations: boolean = + vpa.status.recommendation.containerRecommendations.length > 0; + + vpaResources.push({ + name: vpa.metadata.name, + namespace: vpa.metadata.namespace || "default", + cpuUtilization: null, + memoryUsageBytes: null, + memoryLimitBytes: null, + status: hasRecommendations ? 
"Active" : "Pending", + age: KubernetesResourceUtils.formatAge( + vpa.metadata.creationTimestamp, + ), + additionalAttributes: { + target: `${vpa.spec.targetRef.kind}/${vpa.spec.targetRef.name}`, + updateMode: vpa.spec.updatePolicy.updateMode || "N/A", + }, + }); + } + + setResources(vpaResources); + } catch (err) { + setError(API.getFriendlyMessage(err)); + } + setIsLoading(false); + }; + + useEffect(() => { + fetchData().catch((err: Error) => { + setError(API.getFriendlyMessage(err)); + }); + }, []); + + if (isLoading) { + return ; + } + + if (error) { + return ; + } + + return ( + { + return RouteUtil.populateRouteParams( + RouteMap[ + PageMap.KUBERNETES_CLUSTER_VIEW_VPA_DETAIL + ] as Route, + { + modelId: modelId, + subModelId: new ObjectID(resource.name), + }, + ); + }} + emptyMessage="No VPAs found. VPA data will appear here once the kubernetes-agent Helm chart has resourceSpecs.enabled set to true and includes verticalpodautoscalers." + /> + ); +}; + +export default KubernetesClusterVPAs; diff --git a/App/FeatureSet/Dashboard/src/Routes/KubernetesRoutes.tsx b/App/FeatureSet/Dashboard/src/Routes/KubernetesRoutes.tsx index 9b35382c3c..3a121bd6f6 100644 --- a/App/FeatureSet/Dashboard/src/Routes/KubernetesRoutes.tsx +++ b/App/FeatureSet/Dashboard/src/Routes/KubernetesRoutes.tsx @@ -32,8 +32,13 @@ import KubernetesClusterViewPVCs from "../Pages/Kubernetes/View/PersistentVolume import KubernetesClusterViewPVCDetail from "../Pages/Kubernetes/View/PVCDetail"; import KubernetesClusterViewPVs from "../Pages/Kubernetes/View/PersistentVolumes"; import KubernetesClusterViewPVDetail from "../Pages/Kubernetes/View/PVDetail"; +import KubernetesClusterViewHPAs from "../Pages/Kubernetes/View/HPAs"; +import KubernetesClusterViewHPADetail from "../Pages/Kubernetes/View/HPADetail"; +import KubernetesClusterViewVPAs from "../Pages/Kubernetes/View/VPAs"; +import KubernetesClusterViewVPADetail from "../Pages/Kubernetes/View/VPADetail"; import KubernetesClusterViewEvents from 
"../Pages/Kubernetes/View/Events"; import KubernetesClusterViewControlPlane from "../Pages/Kubernetes/View/ControlPlane"; +import KubernetesClusterViewServiceMesh from "../Pages/Kubernetes/View/ServiceMesh"; import KubernetesClusterViewDelete from "../Pages/Kubernetes/View/Delete"; import KubernetesClusterViewSettings from "../Pages/Kubernetes/View/Settings"; import KubernetesClusterViewDocumentation from "../Pages/Kubernetes/View/Documentation"; @@ -425,6 +430,70 @@ const KubernetesRoutes: FunctionComponent = ( } /> + {/* HPAs */} + + } + /> + + + } + /> + + {/* VPAs */} + + } + /> + + + } + /> + {/* Events */} = ( } /> + {/* Service Mesh */} + + } + /> + {/* Settings */} { return i.value === CheckOn.MetricValue; }); diff --git a/App/FeatureSet/Dashboard/src/Utils/PageMap.ts b/App/FeatureSet/Dashboard/src/Utils/PageMap.ts index 9da1af3e65..b247ed6fb8 100644 --- a/App/FeatureSet/Dashboard/src/Utils/PageMap.ts +++ b/App/FeatureSet/Dashboard/src/Utils/PageMap.ts @@ -242,8 +242,13 @@ enum PageMap { KUBERNETES_CLUSTER_VIEW_PVC_DETAIL = "KUBERNETES_CLUSTER_VIEW_PVC_DETAIL", KUBERNETES_CLUSTER_VIEW_PVS = "KUBERNETES_CLUSTER_VIEW_PVS", KUBERNETES_CLUSTER_VIEW_PV_DETAIL = "KUBERNETES_CLUSTER_VIEW_PV_DETAIL", + KUBERNETES_CLUSTER_VIEW_HPAS = "KUBERNETES_CLUSTER_VIEW_HPAS", + KUBERNETES_CLUSTER_VIEW_HPA_DETAIL = "KUBERNETES_CLUSTER_VIEW_HPA_DETAIL", + KUBERNETES_CLUSTER_VIEW_VPAS = "KUBERNETES_CLUSTER_VIEW_VPAS", + KUBERNETES_CLUSTER_VIEW_VPA_DETAIL = "KUBERNETES_CLUSTER_VIEW_VPA_DETAIL", KUBERNETES_CLUSTER_VIEW_EVENTS = "KUBERNETES_CLUSTER_VIEW_EVENTS", KUBERNETES_CLUSTER_VIEW_CONTROL_PLANE = "KUBERNETES_CLUSTER_VIEW_CONTROL_PLANE", + KUBERNETES_CLUSTER_VIEW_SERVICE_MESH = "KUBERNETES_CLUSTER_VIEW_SERVICE_MESH", KUBERNETES_CLUSTER_VIEW_DELETE = "KUBERNETES_CLUSTER_VIEW_DELETE", KUBERNETES_CLUSTER_VIEW_SETTINGS = "KUBERNETES_CLUSTER_VIEW_SETTINGS", KUBERNETES_CLUSTER_VIEW_DOCUMENTATION = "KUBERNETES_CLUSTER_VIEW_DOCUMENTATION", diff --git 
a/App/FeatureSet/Dashboard/src/Utils/RouteMap.ts b/App/FeatureSet/Dashboard/src/Utils/RouteMap.ts index e2be4229bf..72fb99d3bb 100644 --- a/App/FeatureSet/Dashboard/src/Utils/RouteMap.ts +++ b/App/FeatureSet/Dashboard/src/Utils/RouteMap.ts @@ -83,8 +83,13 @@ export const KubernetesRoutePath: Dictionary = { [PageMap.KUBERNETES_CLUSTER_VIEW_PVC_DETAIL]: `${RouteParams.ModelID}/pvcs/${RouteParams.SubModelID}`, [PageMap.KUBERNETES_CLUSTER_VIEW_PVS]: `${RouteParams.ModelID}/pvs`, [PageMap.KUBERNETES_CLUSTER_VIEW_PV_DETAIL]: `${RouteParams.ModelID}/pvs/${RouteParams.SubModelID}`, + [PageMap.KUBERNETES_CLUSTER_VIEW_HPAS]: `${RouteParams.ModelID}/hpas`, + [PageMap.KUBERNETES_CLUSTER_VIEW_HPA_DETAIL]: `${RouteParams.ModelID}/hpas/${RouteParams.SubModelID}`, + [PageMap.KUBERNETES_CLUSTER_VIEW_VPAS]: `${RouteParams.ModelID}/vpas`, + [PageMap.KUBERNETES_CLUSTER_VIEW_VPA_DETAIL]: `${RouteParams.ModelID}/vpas/${RouteParams.SubModelID}`, [PageMap.KUBERNETES_CLUSTER_VIEW_EVENTS]: `${RouteParams.ModelID}/events`, [PageMap.KUBERNETES_CLUSTER_VIEW_CONTROL_PLANE]: `${RouteParams.ModelID}/control-plane`, + [PageMap.KUBERNETES_CLUSTER_VIEW_SERVICE_MESH]: `${RouteParams.ModelID}/service-mesh`, [PageMap.KUBERNETES_CLUSTER_VIEW_DELETE]: `${RouteParams.ModelID}/delete`, [PageMap.KUBERNETES_CLUSTER_VIEW_SETTINGS]: `${RouteParams.ModelID}/settings`, [PageMap.KUBERNETES_CLUSTER_VIEW_DOCUMENTATION]: `${RouteParams.ModelID}/documentation`, @@ -1649,6 +1654,30 @@ const RouteMap: Dictionary = { }`, ), + [PageMap.KUBERNETES_CLUSTER_VIEW_HPAS]: new Route( + `/dashboard/${RouteParams.ProjectID}/kubernetes/${ + KubernetesRoutePath[PageMap.KUBERNETES_CLUSTER_VIEW_HPAS] + }`, + ), + + [PageMap.KUBERNETES_CLUSTER_VIEW_HPA_DETAIL]: new Route( + `/dashboard/${RouteParams.ProjectID}/kubernetes/${ + KubernetesRoutePath[PageMap.KUBERNETES_CLUSTER_VIEW_HPA_DETAIL] + }`, + ), + + [PageMap.KUBERNETES_CLUSTER_VIEW_VPAS]: new Route( + `/dashboard/${RouteParams.ProjectID}/kubernetes/${ + 
KubernetesRoutePath[PageMap.KUBERNETES_CLUSTER_VIEW_VPAS] + }`, + ), + + [PageMap.KUBERNETES_CLUSTER_VIEW_VPA_DETAIL]: new Route( + `/dashboard/${RouteParams.ProjectID}/kubernetes/${ + KubernetesRoutePath[PageMap.KUBERNETES_CLUSTER_VIEW_VPA_DETAIL] + }`, + ), + [PageMap.KUBERNETES_CLUSTER_VIEW_EVENTS]: new Route( `/dashboard/${RouteParams.ProjectID}/kubernetes/${ KubernetesRoutePath[PageMap.KUBERNETES_CLUSTER_VIEW_EVENTS] @@ -1661,6 +1690,12 @@ const RouteMap: Dictionary = { }`, ), + [PageMap.KUBERNETES_CLUSTER_VIEW_SERVICE_MESH]: new Route( + `/dashboard/${RouteParams.ProjectID}/kubernetes/${ + KubernetesRoutePath[PageMap.KUBERNETES_CLUSTER_VIEW_SERVICE_MESH] + }`, + ), + [PageMap.KUBERNETES_CLUSTER_VIEW_DELETE]: new Route( `/dashboard/${RouteParams.ProjectID}/kubernetes/${ KubernetesRoutePath[PageMap.KUBERNETES_CLUSTER_VIEW_DELETE] diff --git a/Common/Server/Utils/Monitor/MonitorCriteriaEvaluator.ts b/Common/Server/Utils/Monitor/MonitorCriteriaEvaluator.ts index d2b9de0ac9..ac24ea9c4f 100644 --- a/Common/Server/Utils/Monitor/MonitorCriteriaEvaluator.ts +++ b/Common/Server/Utils/Monitor/MonitorCriteriaEvaluator.ts @@ -447,7 +447,10 @@ ${contextBlock} } } - if (input.monitor.monitorType === MonitorType.Metrics) { + if ( + input.monitor.monitorType === MonitorType.Metrics || + input.monitor.monitorType === MonitorType.Kubernetes + ) { const metricMonitorResult: string | null = await MetricMonitorCriteria.isMonitorInstanceCriteriaFilterMet({ dataToProcess: input.dataToProcess, diff --git a/Common/Types/Monitor/KubernetesAlertTemplates.ts b/Common/Types/Monitor/KubernetesAlertTemplates.ts new file mode 100644 index 0000000000..0c13f96f95 --- /dev/null +++ b/Common/Types/Monitor/KubernetesAlertTemplates.ts @@ -0,0 +1,706 @@ +import ObjectID from "../ObjectID"; +import MonitorStep from "./MonitorStep"; +import MonitorCriteria from "./MonitorCriteria"; +import MonitorCriteriaInstance from "./MonitorCriteriaInstance"; +import FilterCondition from 
"../Filter/FilterCondition"; +import { + CheckOn, + FilterType, + EvaluateOverTimeType, +} from "./CriteriaFilter"; +import MonitorStepKubernetesMonitor, { + KubernetesResourceScope, +} from "./MonitorStepKubernetesMonitor"; +import RollingTime from "../RollingTime/RollingTime"; +import MetricsAggregationType from "../Metrics/MetricsAggregationType"; + +export type KubernetesAlertTemplateCategory = + | "Workload" + | "Node" + | "ControlPlane" + | "Storage" + | "Scheduling"; + +export type KubernetesAlertTemplateSeverity = "Critical" | "Warning"; + +export interface KubernetesAlertTemplateArgs { + clusterIdentifier: string; + onlineMonitorStatusId: ObjectID; + offlineMonitorStatusId: ObjectID; + defaultIncidentSeverityId: ObjectID; + defaultAlertSeverityId: ObjectID; + monitorName: string; +} + +export interface KubernetesAlertTemplate { + id: string; + name: string; + description: string; + category: KubernetesAlertTemplateCategory; + severity: KubernetesAlertTemplateSeverity; + getMonitorStep: (args: KubernetesAlertTemplateArgs) => MonitorStep; +} + +function buildKubernetesMonitorStep(args: { + kubernetesMonitor: MonitorStepKubernetesMonitor; + offlineCriteriaInstance: MonitorCriteriaInstance; + onlineCriteriaInstance: MonitorCriteriaInstance; +}): MonitorStep { + const monitorStep: MonitorStep = new MonitorStep(); + + const monitorCriteria: MonitorCriteria = new MonitorCriteria(); + + monitorCriteria.data = { + monitorCriteriaInstanceArray: [ + args.offlineCriteriaInstance, + args.onlineCriteriaInstance, + ], + }; + + monitorStep.data = { + id: ObjectID.generate().toString(), + monitorDestination: undefined, + doNotFollowRedirects: undefined, + monitorDestinationPort: undefined, + monitorCriteria: monitorCriteria, + requestType: "GET" as any, + requestHeaders: undefined, + requestBody: undefined, + customCode: undefined, + screenSizeTypes: undefined, + browserTypes: undefined, + retryCountOnError: undefined, + logMonitor: undefined, + traceMonitor: undefined, + 
metricMonitor: undefined, + exceptionMonitor: undefined, + snmpMonitor: undefined, + dnsMonitor: undefined, + domainMonitor: undefined, + externalStatusPageMonitor: undefined, + kubernetesMonitor: args.kubernetesMonitor, + }; + + return monitorStep; +} + +function buildOfflineCriteriaInstance(args: { + offlineMonitorStatusId: ObjectID; + incidentSeverityId: ObjectID; + alertSeverityId: ObjectID; + monitorName: string; + metricAlias: string; + filterType: FilterType; + value: number; +}): MonitorCriteriaInstance { + const instance: MonitorCriteriaInstance = new MonitorCriteriaInstance(); + + instance.data = { + id: ObjectID.generate().toString(), + monitorStatusId: args.offlineMonitorStatusId, + filterCondition: FilterCondition.Any, + filters: [ + { + checkOn: CheckOn.MetricValue, + filterType: args.filterType, + metricMonitorOptions: { + metricAggregationType: EvaluateOverTimeType.AnyValue, + metricAlias: args.metricAlias, + }, + value: args.value, + }, + ], + incidents: [ + { + title: `${args.monitorName} - Alert Triggered`, + description: `${args.monitorName} has triggered an alert condition.`, + incidentSeverityId: args.incidentSeverityId, + autoResolveIncident: true, + id: ObjectID.generate().toString(), + onCallPolicyIds: [], + }, + ], + alerts: [ + { + title: `${args.monitorName} - Alert`, + description: `${args.monitorName} has triggered an alert condition.`, + alertSeverityId: args.alertSeverityId, + autoResolveAlert: true, + id: ObjectID.generate().toString(), + onCallPolicyIds: [], + }, + ], + changeMonitorStatus: true, + createIncidents: true, + createAlerts: true, + name: `${args.monitorName} - Unhealthy`, + description: `Criteria for detecting unhealthy state.`, + }; + + return instance; +} + +function buildOnlineCriteriaInstance(args: { + onlineMonitorStatusId: ObjectID; + metricAlias: string; + filterType: FilterType; + value: number; +}): MonitorCriteriaInstance { + const instance: MonitorCriteriaInstance = new MonitorCriteriaInstance(); + + 
instance.data = { + id: ObjectID.generate().toString(), + monitorStatusId: args.onlineMonitorStatusId, + filterCondition: FilterCondition.Any, + filters: [ + { + checkOn: CheckOn.MetricValue, + filterType: args.filterType, + metricMonitorOptions: { + metricAggregationType: EvaluateOverTimeType.AnyValue, + metricAlias: args.metricAlias, + }, + value: args.value, + }, + ], + incidents: [], + alerts: [], + changeMonitorStatus: true, + createIncidents: false, + createAlerts: false, + name: "Healthy", + description: "Criteria for healthy state.", + }; + + return instance; +} + +function buildKubernetesMonitorConfig(args: { + clusterIdentifier: string; + metricName: string; + metricAlias: string; + resourceScope: KubernetesResourceScope; + rollingTime: RollingTime; + aggregationType: MetricsAggregationType; + attributes?: Record; +}): MonitorStepKubernetesMonitor { + return { + clusterIdentifier: args.clusterIdentifier, + resourceScope: args.resourceScope, + resourceFilters: {}, + metricViewConfig: { + queryConfigs: [ + { + metricAliasData: { + metricVariable: args.metricAlias, + title: args.metricAlias, + description: args.metricAlias, + legend: args.metricAlias, + legendUnit: undefined, + }, + metricQueryData: { + filterData: { + metricName: args.metricName, + attributes: args.attributes || {}, + aggegationType: args.aggregationType, + aggregateBy: {}, + }, + }, + }, + ], + formulaConfigs: [], + }, + rollingTime: args.rollingTime, + }; +} + +// --- Template Definitions --- + +const crashLoopBackOffTemplate: KubernetesAlertTemplate = { + id: "k8s-crashloopbackoff", + name: "CrashLoopBackOff Detection", + description: + "Alert when container restart count exceeds threshold, indicating a CrashLoopBackOff condition.", + category: "Workload", + severity: "Critical", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "container_restarts"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: 
buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.container.restarts", + metricAlias, + resourceScope: KubernetesResourceScope.Cluster, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Max, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 5, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.LessThanOrEqualTo, + value: 5, + }), + }); + }, +}; + +const podPendingTemplate: KubernetesAlertTemplate = { + id: "k8s-pod-pending", + name: "Pod Stuck in Pending", + description: + "Alert when pods remain in Pending phase, indicating scheduling or resource issues.", + category: "Scheduling", + severity: "Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "pending_pods"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.pod.phase", + metricAlias, + resourceScope: KubernetesResourceScope.Cluster, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Sum, + attributes: { "k8s.pod.phase": "Pending" }, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + 
filterType: FilterType.EqualTo, + value: 0, + }), + }); + }, +}; + +const nodeNotReadyTemplate: KubernetesAlertTemplate = { + id: "k8s-node-not-ready", + name: "Node Not Ready", + description: + "Alert when a node condition transitions to NotReady, indicating node health issues.", + category: "Node", + severity: "Critical", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "node_ready"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.node.condition_ready", + metricAlias, + resourceScope: KubernetesResourceScope.Node, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Min, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.EqualTo, + value: 0, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + }); + }, +}; + +const highCpuTemplate: KubernetesAlertTemplate = { + id: "k8s-high-cpu", + name: "High Node CPU Utilization", + description: + "Alert when node CPU utilization exceeds 90% sustained.", + category: "Node", + severity: "Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "node_cpu"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.node.cpu.utilization", + metricAlias, + resourceScope: KubernetesResourceScope.Node, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Avg, + }), + offlineCriteriaInstance: 
buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 90, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.LessThanOrEqualTo, + value: 90, + }), + }); + }, +}; + +const highMemoryTemplate: KubernetesAlertTemplate = { + id: "k8s-high-memory", + name: "High Node Memory Utilization", + description: + "Alert when node memory utilization exceeds 85% sustained.", + category: "Node", + severity: "Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "node_memory"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.node.memory.usage", + metricAlias, + resourceScope: KubernetesResourceScope.Node, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Avg, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 85, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.LessThanOrEqualTo, + value: 85, + }), + }); + }, +}; + +const deploymentReplicaMismatchTemplate: KubernetesAlertTemplate = { + id: "k8s-deployment-replica-mismatch", + name: "Deployment Replica Mismatch", + description: + "Alert when available replicas are less than desired replicas for a deployment.", + category: "Workload", + severity: 
"Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "unavailable_replicas"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.deployment.unavailable_replicas", + metricAlias, + resourceScope: KubernetesResourceScope.Workload, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Max, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.EqualTo, + value: 0, + }), + }); + }, +}; + +const jobFailuresTemplate: KubernetesAlertTemplate = { + id: "k8s-job-failures", + name: "Job Failures", + description: "Alert when Kubernetes jobs fail.", + category: "Workload", + severity: "Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "failed_pods"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.job.failed_pods", + metricAlias, + resourceScope: KubernetesResourceScope.Workload, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Max, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + onlineCriteriaInstance: 
buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.EqualTo, + value: 0, + }), + }); + }, +}; + +const etcdNoLeaderTemplate: KubernetesAlertTemplate = { + id: "k8s-etcd-no-leader", + name: "etcd No Leader", + description: + "Alert immediately when etcd has no leader elected. This is a critical cluster health issue.", + category: "ControlPlane", + severity: "Critical", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "etcd_has_leader"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "etcd_server_has_leader", + metricAlias, + resourceScope: KubernetesResourceScope.Cluster, + rollingTime: RollingTime.Past1Minute, + aggregationType: MetricsAggregationType.Min, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.EqualTo, + value: 0, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + }); + }, +}; + +const apiServerThrottlingTemplate: KubernetesAlertTemplate = { + id: "k8s-apiserver-throttling", + name: "API Server Throttling", + description: + "Alert when the Kubernetes API server is dropping requests due to throttling.", + category: "ControlPlane", + severity: "Critical", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "dropped_requests"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "apiserver_dropped_requests_total", 
+ metricAlias, + resourceScope: KubernetesResourceScope.Cluster, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Sum, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.EqualTo, + value: 0, + }), + }); + }, +}; + +const schedulerBacklogTemplate: KubernetesAlertTemplate = { + id: "k8s-scheduler-backlog", + name: "Scheduler Backlog", + description: + "Alert when there are pods waiting to be scheduled for more than 5 minutes.", + category: "Scheduling", + severity: "Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "pending_pods"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "scheduler_pending_pods", + metricAlias, + resourceScope: KubernetesResourceScope.Cluster, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Avg, /* NOTE(review): an average above 0 over the window presumably only shows that pods were pending at some point in it, not for the whole 5 minutes the description promises; confirm whether Min was intended. */ + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.EqualTo, + value: 0, + }), + }); + }, +}; + +const highDiskUsageTemplate: KubernetesAlertTemplate = { + id: "k8s-high-disk-usage", + name:
"High Node Disk Usage", + description: + "Alert when node filesystem usage exceeds 90% capacity.", + category: "Storage", + severity: "Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "disk_usage"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.node.filesystem.usage", /* NOTE(review): the criteria below compare this metric with 90 as a percentage, but k8s.node.filesystem.usage is presumably a byte count; confirm the unit or derive a percentage from usage and capacity. */ + metricAlias, + resourceScope: KubernetesResourceScope.Node, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Avg, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, + incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 90, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.LessThanOrEqualTo, + value: 90, + }), + }); + }, +}; + +const daemonSetUnavailableTemplate: KubernetesAlertTemplate = { + id: "k8s-daemonset-unavailable", + name: "DaemonSet Unavailable Nodes", + description: + "Alert when a DaemonSet has unavailable nodes where the daemon pod should be running.", + category: "Workload", + severity: "Warning", + getMonitorStep: (args: KubernetesAlertTemplateArgs): MonitorStep => { + const metricAlias: string = "unavailable_nodes"; + + return buildKubernetesMonitorStep({ + kubernetesMonitor: buildKubernetesMonitorConfig({ + clusterIdentifier: args.clusterIdentifier, + metricName: "k8s.daemonset.misscheduled_nodes", /* NOTE(review): the template name, alias and description speak of unavailable nodes, yet the metric queried is misscheduled_nodes, which presumably counts nodes running the daemon pod that should not run it; confirm the intended metric. */ + metricAlias, + resourceScope: KubernetesResourceScope.Workload, + rollingTime: RollingTime.Past5Minutes, + aggregationType: MetricsAggregationType.Max, + }), + offlineCriteriaInstance: buildOfflineCriteriaInstance({ + offlineMonitorStatusId: args.offlineMonitorStatusId, +
incidentSeverityId: args.defaultIncidentSeverityId, + alertSeverityId: args.defaultAlertSeverityId, + monitorName: args.monitorName, + metricAlias, + filterType: FilterType.GreaterThan, + value: 0, + }), + onlineCriteriaInstance: buildOnlineCriteriaInstance({ + onlineMonitorStatusId: args.onlineMonitorStatusId, + metricAlias, + filterType: FilterType.EqualTo, + value: 0, + }), + }); + }, +}; + +export function getAllKubernetesAlertTemplates(): Array<KubernetesAlertTemplate> { /* Returns a new array holding every built-in template, in the fixed order listed here. */ + return [ + crashLoopBackOffTemplate, + podPendingTemplate, + nodeNotReadyTemplate, + highCpuTemplate, + highMemoryTemplate, + deploymentReplicaMismatchTemplate, + jobFailuresTemplate, + etcdNoLeaderTemplate, + apiServerThrottlingTemplate, + schedulerBacklogTemplate, + highDiskUsageTemplate, + daemonSetUnavailableTemplate, + ]; +} + +export function getKubernetesAlertTemplatesByCategory( + category: KubernetesAlertTemplateCategory, +): Array<KubernetesAlertTemplate> { /* Returns the built-in templates whose category strictly equals the given one; empty when none match. */ + return getAllKubernetesAlertTemplates().filter( + (template: KubernetesAlertTemplate) => template.category === category, + ); +} + +export function getKubernetesAlertTemplateById( + id: string, +): KubernetesAlertTemplate | undefined { /* Returns the first built-in template whose id strictly equals the given id, or undefined when there is none. */ + return getAllKubernetesAlertTemplates().find( + (template: KubernetesAlertTemplate) => template.id === id, + ); +} diff --git a/Common/Types/Monitor/MonitorCriteriaInstance.ts b/Common/Types/Monitor/MonitorCriteriaInstance.ts index a2327421b3..06693512c7 100644 --- a/Common/Types/Monitor/MonitorCriteriaInstance.ts +++ b/Common/Types/Monitor/MonitorCriteriaInstance.ts @@ -212,6 +212,43 @@ export default class MonitorCriteriaInstance extends DatabaseProperty { return monitorCriteriaInstance; } + if (arg.monitorType === MonitorType.Kubernetes) { + const monitorCriteriaInstance: MonitorCriteriaInstance = + new MonitorCriteriaInstance(); + + monitorCriteriaInstance.data = { + id: ObjectID.generate().toString(), + monitorStatusId: arg.monitorStatusId, + filterCondition: FilterCondition.Any, + filters: [ + { + checkOn: CheckOn.MetricValue, +
filterType: FilterType.GreaterThan, + + metricMonitorOptions: { + metricAggregationType: EvaluateOverTimeType.AnyValue, + metricAlias: + arg.metricOptions && + arg.metricOptions.metricAliases && + arg.metricOptions.metricAliases.length > 0 + ? arg.metricOptions.metricAliases[0] + : undefined, + }, + value: 0, + }, + ], + incidents: [], + alerts: [], + changeMonitorStatus: true, + createIncidents: false, + createAlerts: false, + name: `Check if ${arg.monitorName} is online`, + description: `This criteria checks if the ${arg.monitorName} is online`, + }; + + return monitorCriteriaInstance; + } + if (arg.monitorType === MonitorType.Traces) { const monitorCriteriaInstance: MonitorCriteriaInstance = new MonitorCriteriaInstance(); @@ -873,6 +910,55 @@ export default class MonitorCriteriaInstance extends DatabaseProperty { }; } + if (arg.monitorType === MonitorType.Kubernetes) { + monitorCriteriaInstance.data = { + id: ObjectID.generate().toString(), + monitorStatusId: arg.monitorStatusId, + filterCondition: FilterCondition.Any, + filters: [ + { + checkOn: CheckOn.MetricValue, + filterType: FilterType.EqualTo, + metricMonitorOptions: { + metricAggregationType: EvaluateOverTimeType.AnyValue, + metricAlias: + arg.metricOptions && + arg.metricOptions.metricAliases && + arg.metricOptions.metricAliases.length > 0 + ? 
arg.metricOptions.metricAliases[0] + : undefined, + }, + value: 0, + }, + ], + incidents: [ + { + title: `${arg.monitorName} is offline`, + description: `${arg.monitorName} is currently offline.`, + incidentSeverityId: arg.incidentSeverityId, + autoResolveIncident: true, + id: ObjectID.generate().toString(), + onCallPolicyIds: [], + }, + ], + alerts: [ + { + title: `${arg.monitorName} is offline`, + description: `${arg.monitorName} is currently offline.`, + alertSeverityId: arg.alertSeverityId, + autoResolveAlert: true, + id: ObjectID.generate().toString(), + onCallPolicyIds: [], + }, + ], + createAlerts: false, + changeMonitorStatus: true, + createIncidents: true, + name: `Check if ${arg.monitorName} is offline`, + description: `This criteria checks if the ${arg.monitorName} is offline`, + }; + } + if (arg.monitorType === MonitorType.Traces) { monitorCriteriaInstance.data = { id: ObjectID.generate().toString(), diff --git a/Common/Types/Monitor/MonitorStep.ts b/Common/Types/Monitor/MonitorStep.ts index ae6e8504e3..8c0ad13d9a 100644 --- a/Common/Types/Monitor/MonitorStep.ts +++ b/Common/Types/Monitor/MonitorStep.ts @@ -38,6 +38,9 @@ import MonitorStepDomainMonitor, { import MonitorStepExternalStatusPageMonitor, { MonitorStepExternalStatusPageMonitorUtil, } from "./MonitorStepExternalStatusPageMonitor"; +import MonitorStepKubernetesMonitor, { + MonitorStepKubernetesMonitorUtil, +} from "./MonitorStepKubernetesMonitor"; import Zod, { ZodSchema } from "../../Utils/Schema/Zod"; export interface MonitorStepType { @@ -90,6 +93,9 @@ export interface MonitorStepType { // External Status Page monitor externalStatusPageMonitor?: MonitorStepExternalStatusPageMonitor | undefined; + + // Kubernetes monitor + kubernetesMonitor?: MonitorStepKubernetesMonitor | undefined; } export default class MonitorStep extends DatabaseProperty { @@ -119,6 +125,7 @@ export default class MonitorStep extends DatabaseProperty { dnsMonitor: undefined, domainMonitor: undefined, 
externalStatusPageMonitor: undefined, + kubernetesMonitor: undefined, }; } @@ -153,6 +160,7 @@ export default class MonitorStep extends DatabaseProperty { dnsMonitor: undefined, domainMonitor: undefined, externalStatusPageMonitor: undefined, + kubernetesMonitor: undefined, }; return monitorStep; @@ -267,6 +275,13 @@ export default class MonitorStep extends DatabaseProperty { return this; } + public setKubernetesMonitor( + kubernetesMonitor: MonitorStepKubernetesMonitor, + ): MonitorStep { + this.data!.kubernetesMonitor = kubernetesMonitor; + return this; + } + public setCustomCode(customCode: string): MonitorStep { this.data!.customCode = customCode; return this; @@ -293,8 +308,9 @@ export default class MonitorStep extends DatabaseProperty { screenSizeTypes: undefined, browserTypes: undefined, retryCountOnError: undefined, - lgoMonitor: undefined, + logMonitor: undefined, exceptionMonitor: undefined, + kubernetesMonitor: undefined, }, }; } @@ -405,6 +421,16 @@ export default class MonitorStep extends DatabaseProperty { } } + if (monitorType === MonitorType.Kubernetes) { + if (!value.data.kubernetesMonitor) { + return "Kubernetes monitor configuration is required"; + } + + if (!value.data.kubernetesMonitor.clusterIdentifier) { + return "Kubernetes cluster is required"; + } + } + return null; } @@ -461,6 +487,11 @@ export default class MonitorStep extends DatabaseProperty { this.data.externalStatusPageMonitor, ) : undefined, + kubernetesMonitor: this.data.kubernetesMonitor + ? MonitorStepKubernetesMonitorUtil.toJSON( + this.data.kubernetesMonitor, + ) + : undefined, }, }); } @@ -575,6 +606,9 @@ export default class MonitorStep extends DatabaseProperty { externalStatusPageMonitor: json["externalStatusPageMonitor"] ? (json["externalStatusPageMonitor"] as JSONObject) : undefined, + kubernetesMonitor: json["kubernetesMonitor"] + ? 
(json["kubernetesMonitor"] as JSONObject) + : undefined, }) as any; return monitorStep; @@ -603,6 +637,7 @@ export default class MonitorStep extends DatabaseProperty { dnsMonitor: Zod.any().optional(), domainMonitor: Zod.any().optional(), externalStatusPageMonitor: Zod.any().optional(), + kubernetesMonitor: Zod.any().optional(), }).openapi({ type: "object", example: { diff --git a/Common/Types/Monitor/MonitorStepKubernetesMonitor.ts b/Common/Types/Monitor/MonitorStepKubernetesMonitor.ts new file mode 100644 index 0000000000..d27479b049 --- /dev/null +++ b/Common/Types/Monitor/MonitorStepKubernetesMonitor.ts @@ -0,0 +1,50 @@ +import { JSONObject } from "../JSON"; +import MetricsViewConfig from "../Metrics/MetricsViewConfig"; +import RollingTime from "../RollingTime/RollingTime"; + +export enum KubernetesResourceScope { + Cluster = "Cluster", + Namespace = "Namespace", + Workload = "Workload", + Node = "Node", + Pod = "Pod", +} + +export interface KubernetesResourceFilters { + namespace?: string | undefined; + workloadType?: string | undefined; // deployment, statefulset, daemonset, job, cronjob + workloadName?: string | undefined; + nodeName?: string | undefined; + podName?: string | undefined; +} + +export default interface MonitorStepKubernetesMonitor { + clusterIdentifier: string; + resourceScope: KubernetesResourceScope; + resourceFilters: KubernetesResourceFilters; + metricViewConfig: MetricsViewConfig; + rollingTime: RollingTime; +} + +export class MonitorStepKubernetesMonitorUtil { + public static getDefault(): MonitorStepKubernetesMonitor { + return { + clusterIdentifier: "", + resourceScope: KubernetesResourceScope.Cluster, + resourceFilters: {}, + metricViewConfig: { + queryConfigs: [], + formulaConfigs: [], + }, + rollingTime: RollingTime.Past1Minute, + }; + } + + public static fromJSON(json: JSONObject): MonitorStepKubernetesMonitor { + return json as any as MonitorStepKubernetesMonitor; + } + + public static toJSON(monitor: 
MonitorStepKubernetesMonitor): JSONObject { + return monitor as any as JSONObject; + } +} diff --git a/Common/Types/Monitor/MonitorType.ts b/Common/Types/Monitor/MonitorType.ts index faad5be0ec..e3dfd720f0 100644 --- a/Common/Types/Monitor/MonitorType.ts +++ b/Common/Types/Monitor/MonitorType.ts @@ -83,6 +83,10 @@ export class MonitorTypeHelper { label: "Infrastructure", monitorTypes: [MonitorType.Server, MonitorType.SNMP], }, + { + label: "Kubernetes", + monitorTypes: [MonitorType.Kubernetes], + }, { label: "Telemetry", monitorTypes: [ @@ -104,7 +108,8 @@ export class MonitorTypeHelper { monitorType === MonitorType.Logs || monitorType === MonitorType.Metrics || monitorType === MonitorType.Traces || - monitorType === MonitorType.Exceptions + monitorType === MonitorType.Exceptions || + monitorType === MonitorType.Kubernetes ); } @@ -142,15 +147,13 @@ export class MonitorTypeHelper { "This monitor type does the basic ping test of an endpoint.", icon: IconProp.Signal, }, - /* - * { - * monitorType: MonitorType.Kubernetes, - * title: 'Kubernetes', - * description: - * 'This monitor types lets you monitor Kubernetes clusters.', - * icon: IconProp.Cube, - * }, - */ + { + monitorType: MonitorType.Kubernetes, + title: "Kubernetes", + description: + "This monitor type lets you monitor Kubernetes clusters, workloads, nodes, and pods.", + icon: IconProp.Cube, + }, { monitorType: MonitorType.IP, title: "IP", @@ -334,6 +337,7 @@ export class MonitorTypeHelper { MonitorType.DNS, MonitorType.Domain, MonitorType.ExternalStatusPage, + MonitorType.Kubernetes, ]; } diff --git a/HelmChart/Public/kubernetes-agent/templates/configmap-deployment.yaml b/HelmChart/Public/kubernetes-agent/templates/configmap-deployment.yaml index 96060fdfce..0533f7ac45 100644 --- a/HelmChart/Public/kubernetes-agent/templates/configmap-deployment.yaml +++ b/HelmChart/Public/kubernetes-agent/templates/configmap-deployment.yaml @@ -69,13 +69,22 @@ data: - name: persistentvolumes mode: pull interval: {{ 
.Values.resourceSpecs.interval }} + - name: horizontalpodautoscalers + mode: pull + interval: {{ .Values.resourceSpecs.interval }} + group: autoscaling + - name: verticalpodautoscalers + mode: pull + interval: {{ .Values.resourceSpecs.interval }} + group: autoscaling.k8s.io {{- end }} - {{- if .Values.controlPlane.enabled }} - # Scrape control plane metrics via Prometheus endpoints + {{- if or .Values.controlPlane.enabled .Values.serviceMesh.enabled }} + # Scrape metrics via Prometheus endpoints (control plane and/or service mesh) prometheus: config: scrape_configs: + {{- if .Values.controlPlane.enabled }} - job_name: etcd scheme: https tls_config: @@ -115,6 +124,53 @@ data: - targets: - {{ . | quote }} {{- end }} + {{- end }} + {{- if and .Values.serviceMesh.enabled (eq .Values.serviceMesh.provider "istio") }} + - job_name: envoy-stats + metrics_path: /stats/prometheus + scrape_interval: {{ .Values.serviceMesh.istio.scrapeInterval }} + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + action: keep + regex: istio-proxy + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:15090 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod_name + {{- end }} + {{- if and .Values.serviceMesh.enabled (eq .Values.serviceMesh.provider "linkerd") }} + - job_name: linkerd-proxy + metrics_path: /metrics + scrape_interval: {{ .Values.serviceMesh.linkerd.scrapeInterval }} + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + action: keep + regex: linkerd-proxy + - source_labels: [__meta_kubernetes_pod_container_port_name] + 
action: keep + regex: linkerd-admin + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod_name + {{- end }} {{- end }} processors: @@ -196,7 +252,7 @@ data: metrics: receivers: - k8s_cluster - {{- if .Values.controlPlane.enabled }} + {{- if or .Values.controlPlane.enabled .Values.serviceMesh.enabled }} - prometheus {{- end }} processors: diff --git a/HelmChart/Public/kubernetes-agent/templates/rbac.yaml b/HelmChart/Public/kubernetes-agent/templates/rbac.yaml index 3272d96acc..6ae42f8093 100644 --- a/HelmChart/Public/kubernetes-agent/templates/rbac.yaml +++ b/HelmChart/Public/kubernetes-agent/templates/rbac.yaml @@ -66,6 +66,24 @@ rules: resources: - events verbs: ["get", "list", "watch"] + - apiGroups: ["autoscaling.k8s.io"] + resources: + - verticalpodautoscalers + verbs: ["get", "list", "watch"] +{{- if and .Values.serviceMesh.enabled (eq .Values.serviceMesh.provider "istio") }} + - apiGroups: ["networking.istio.io"] + resources: + - virtualservices + - destinationrules + - gateways + - serviceentries + verbs: ["get", "list", "watch"] + - apiGroups: ["security.istio.io"] + resources: + - peerauthentications + - authorizationpolicies + verbs: ["get", "list", "watch"] +{{- end }} # For kubeletstats receiver - nonResourceURLs: - /metrics diff --git a/HelmChart/Public/kubernetes-agent/values.yaml b/HelmChart/Public/kubernetes-agent/values.yaml index 691915d72c..d5c62be6e5 100644 --- a/HelmChart/Public/kubernetes-agent/values.yaml +++ b/HelmChart/Public/kubernetes-agent/values.yaml @@ -80,6 +80,16 @@ resourceSpecs: # Collection intervals collectionInterval: 30s +# Service mesh observability (Istio / Linkerd sidecar metrics) +serviceMesh: + enabled: false + # Supported providers: "istio", "linkerd" + provider: "istio" + istio: + scrapeInterval: 15s + linkerd: + scrapeInterval: 15s + # 
Service account configuration serviceAccount: create: true diff --git a/Worker/Jobs/TelemetryMonitor/MonitorTelemetryMonitor.ts b/Worker/Jobs/TelemetryMonitor/MonitorTelemetryMonitor.ts index b7ba622f60..981cd57c8b 100644 --- a/Worker/Jobs/TelemetryMonitor/MonitorTelemetryMonitor.ts +++ b/Worker/Jobs/TelemetryMonitor/MonitorTelemetryMonitor.ts @@ -42,6 +42,7 @@ import MonitorStepExceptionMonitor, { } from "Common/Types/Monitor/MonitorStepExceptionMonitor"; import ExceptionInstanceService from "Common/Server/Services/ExceptionInstanceService"; import ExceptionInstance from "Common/Models/AnalyticsModels/ExceptionInstance"; +import MonitorStepKubernetesMonitor from "Common/Types/Monitor/MonitorStepKubernetesMonitor"; RunCron( "TelemetryMonitor:MonitorTelemetryMonitor", @@ -60,6 +61,7 @@ RunCron( MonitorType.Traces, MonitorType.Metrics, MonitorType.Exceptions, + MonitorType.Kubernetes, ]), telemetryMonitorNextMonitorAt: DatabaseQueryHelper.lessThanEqualToOrNull( @@ -224,6 +226,14 @@ const monitorTelemetryMonitor: MonitorTelemetryMonitorFunction = async (data: { }); } + if (monitorType === MonitorType.Kubernetes) { + return monitorKubernetes({ + monitorStep, + monitorId, + projectId, + }); + } + throw new BadDataException("Monitor type is not supported"); }; @@ -392,6 +402,125 @@ const monitorException: MonitorExceptionFunction = async (data: { }; }; +type MonitorKubernetesFunction = (data: { + monitorStep: MonitorStep; + monitorId: ObjectID; + projectId: ObjectID; +}) => Promise; + +const monitorKubernetes: MonitorKubernetesFunction = async (data: { + monitorStep: MonitorStep; + monitorId: ObjectID; + projectId: ObjectID; +}): Promise => { + const kubernetesMonitorConfig: MonitorStepKubernetesMonitor | undefined = + data.monitorStep.data?.kubernetesMonitor; + + if (!kubernetesMonitorConfig) { + throw new BadDataException("Kubernetes monitor config is missing"); + } + + const startAndEndDate: InBetween = + RollingTimeUtil.convertToStartAndEndDate( + 
kubernetesMonitorConfig.rollingTime || RollingTime.Past1Minute, + ); + + const finalResult: Array<AggregatedResult> = []; /* one aggregated result is pushed per query config, in query order */ + + for (const queryConfig of kubernetesMonitorConfig.metricViewConfig + .queryConfigs) { + const query: Query = { /* NOTE(review): Query here and Dictionary below carry no type arguments; supply them if these types are generic in this codebase. */ + projectId: data.projectId, + time: startAndEndDate, + name: queryConfig.metricQueryData.filterData.metricName, + }; + + // Start with any user-defined attribute filters + const attributes: Dictionary = {}; + + if ( + queryConfig.metricQueryData && + queryConfig.metricQueryData.filterData && + queryConfig.metricQueryData.filterData.attributes && + Object.keys(queryConfig.metricQueryData.filterData.attributes).length > 0 + ) { + Object.assign( + attributes, + queryConfig.metricQueryData.filterData.attributes, + ); + } + + // Add Kubernetes-specific attribute filters (written after the user-defined ones, so they win on a key collision) + if (kubernetesMonitorConfig.clusterIdentifier) { + attributes["k8s.cluster.name"] = + kubernetesMonitorConfig.clusterIdentifier; + } + + if (kubernetesMonitorConfig.resourceFilters) { + const resourceFilters = kubernetesMonitorConfig.resourceFilters; + + if (resourceFilters.namespace) { + attributes["k8s.namespace.name"] = resourceFilters.namespace; + } + + if (resourceFilters.nodeName) { + attributes["k8s.node.name"] = resourceFilters.nodeName; + } + + if (resourceFilters.podName) { + attributes["k8s.pod.name"] = resourceFilters.podName; + } + + if (resourceFilters.workloadName && resourceFilters.workloadType) { + const workloadType: string = + resourceFilters.workloadType.toLowerCase(); + attributes[`k8s.${workloadType}.name`] = + resourceFilters.workloadName; + } + } + + if (Object.keys(attributes).length > 0) { + query.attributes = attributes; + } + + const aggregatedResults: AggregatedResult = + await MetricService.aggregateBy({ + query: query, + aggregationType: + (queryConfig.metricQueryData.filterData + .aggegationType as MetricsAggregationType) || + MetricsAggregationType.Avg, + aggregateColumnName: "value", + aggregationTimestampColumnName: "time", + startTimestamp: +
(startAndEndDate?.startValue as Date) || + OneUptimeDate.getCurrentDate(), + endTimestamp: + (startAndEndDate?.endValue as Date) || + OneUptimeDate.getCurrentDate(), + limit: LIMIT_PER_PROJECT, + skip: 0, + groupBy: queryConfig.metricQueryData.groupBy, + props: { + isRoot: true, + }, + }); + + logger.debug("Kubernetes monitor aggregated results"); + logger.debug(aggregatedResults); + + finalResult.push(aggregatedResults); + } + + return { + projectId: data.projectId, + metricViewConfig: kubernetesMonitorConfig.metricViewConfig, + startAndEndDate: startAndEndDate, + metricResult: finalResult, + monitorId: data.monitorId, + }; +}; + type MonitorLogsFunction = (data: { monitorStep: MonitorStep; monitorId: ObjectID;