From 5cb48400a2e8105203be53ca5cb0d30413b8b352 Mon Sep 17 00:00:00 2001 From: Nawaz Dhandala Date: Wed, 25 Mar 2026 13:19:33 +0000 Subject: [PATCH] feat: enhance Kubernetes service mesh metrics view with Istio and Linkerd support - Introduced new metric query specifications for Istio and Linkerd. - Refactored metric data fetching to utilize a unified query builder. - Added a global time range picker for metrics visualization. - Created reusable MeshSection component for displaying metrics. - Updated UI to present Istio and Linkerd metrics in organized sections. --- .../Pages/Kubernetes/View/ControlPlane.tsx | 763 +++++++++++----- .../src/Pages/Kubernetes/View/ServiceMesh.tsx | 858 ++++++++++++++---- 2 files changed, 1193 insertions(+), 428 deletions(-) diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ControlPlane.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ControlPlane.tsx index 3726d4cf74..5499852aa5 100644 --- a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ControlPlane.tsx +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ControlPlane.tsx @@ -7,12 +7,13 @@ import MetricView from "../../../Components/Metrics/MetricView"; import MetricViewData from "Common/Types/Metrics/MetricViewData"; import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData"; import AggregationType from "Common/Types/BaseDatabase/AggregationType"; -import OneUptimeDate from "Common/Types/Date"; -import InBetween from "Common/Types/BaseDatabase/InBetween"; +import IconProp from "Common/Types/Icon/IconProp"; +import Icon from "Common/UI/Components/Icon/Icon"; import React, { Fragment, FunctionComponent, ReactElement, + useCallback, useEffect, useState, } from "react"; @@ -22,6 +23,435 @@ import PageLoader from "Common/UI/Components/Loader/PageLoader"; import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils"; +import RangeStartAndEndDateTime, { + RangeStartAndEndDateTimeUtil, +} from "Common/Types/Time/RangeStartAndEndDateTime"; +import TimeRange from "Common/Types/Time/TimeRange"; +import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView"; +import InBetween from "Common/Types/BaseDatabase/InBetween"; + +// ────────────────────────────────────────────────────────────────────────────── +// Query builder helper +// ────────────────────────────────────────────────────────────────────────────── + +interface MetricQuerySpec { + variable: string; + title: string; + description: string; + legend: string; + legendUnit: string; + metricName: string; + aggregation: AggregationType; + yAxisFormatter?: (value: number) => string; +} + +function buildQuery( + spec: MetricQuerySpec, + clusterIdentifier: string, +): MetricQueryConfigData { + return { + metricAliasData: { + metricVariable: spec.variable, + title: spec.title, + description: spec.description, + legend: spec.legend, + legendUnit: spec.legendUnit, + }, + metricQueryData: { + filterData: { + metricName: spec.metricName, + attributes: { + "resource.k8s.cluster.name": clusterIdentifier, + }, + aggegationType: spec.aggregation, + aggregateBy: {}, + }, + groupBy: { + attributes: true, + }, + }, + yAxisValueFormatter: spec.yAxisFormatter, + }; +} + +function buildMetricViewData( + queries: Array, + startAndEndDate: InBetween, +): MetricViewData { + return { + startAndEndDate, + queryConfigs: queries, + formulaConfigs: [], + }; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Section component — one Card per control plane component +// ────────────────────────────────────────────────────────────────────────────── + +interface SectionProps { + title: string; + description: string; + icon: IconProp; + data: MetricViewData; +} + +const ControlPlaneSection: FunctionComponent = ( + props: SectionProps, +): ReactElement => { + return ( + + + {props.title} + + } + description={props.description} + > + {}} + /> + + ); +}; + +// ────────────────────────────────────────────────────────────────────────────── +// Metric specs per control plane component +// ────────────────────────────────────────────────────────────────────────────── + +function getEtcdQueries(cluster: string): Array { + return [ + buildQuery( + { + variable: "etcd_db_size", + title: "Database Size", + description: "Total size of the etcd MVCC database on disk.", + legend: "DB Size", + legendUnit: "", + metricName: "etcd_mvcc_db_total_size_in_bytes", + aggregation: AggregationType.Avg, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "etcd_db_size_in_use", + title: "Database Size In Use", + description: "Actual used size of the etcd database (after compaction).", + legend: "In Use", + legendUnit: "", + metricName: "etcd_mvcc_db_total_size_in_use_in_bytes", + aggregation: AggregationType.Avg, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "etcd_keys_total", + title: "Total Keys", + description: "Total number of keys stored in etcd.", + legend: "Keys", + legendUnit: "", + metricName: "etcd_debugging_mvcc_keys_total", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "etcd_leader_changes", + title: "Leader Changes", + description: + "Number of leader changes seen. Frequent changes indicate instability.", + legend: "Changes", + legendUnit: "", + metricName: "etcd_server_leader_changes_seen_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "etcd_proposals_failed", + title: "Failed Proposals", + description: "Total number of failed raft proposals.", + legend: "Failed", + legendUnit: "", + metricName: "etcd_server_proposals_failed_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "etcd_disk_wal_fsync", + title: "WAL Fsync Duration", + description: + "Latency of WAL fsync operations. High values cause slow commits.", + legend: "Duration", + legendUnit: "seconds", + metricName: "etcd_disk_wal_fsync_duration_seconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "etcd_disk_backend_commit", + title: "Backend Commit Duration", + description: "Latency of backend commit operations.", + legend: "Duration", + legendUnit: "seconds", + metricName: "etcd_disk_backend_commit_duration_seconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "etcd_network_peer_sent", + title: "Network Peer Sent Bytes", + description: "Total bytes sent to peers. Shows replication traffic.", + legend: "Sent", + legendUnit: "", + metricName: "etcd_network_peer_sent_bytes_total", + aggregation: AggregationType.Sum, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + ]; +} + +function getApiServerQueries(cluster: string): Array { + return [ + buildQuery( + { + variable: "apiserver_requests", + title: "Request Rate", + description: "Total API server requests.", + legend: "Requests", + legendUnit: "req/s", + metricName: "apiserver_request_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "apiserver_latency", + title: "Request Latency", + description: "Average API server request duration.", + legend: "Latency", + legendUnit: "seconds", + metricName: "apiserver_request_duration_seconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "apiserver_errors", + title: "Request Errors (5xx)", + description: + "API server 5xx error count. Non-zero values need investigation.", + legend: "Errors", + legendUnit: "", + metricName: "apiserver_request_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "apiserver_inflight_requests", + title: "In-Flight Requests", + description: + "Current number of in-flight requests being processed. High counts indicate API server saturation.", + legend: "Requests", + legendUnit: "", + metricName: "apiserver_current_inflight_requests", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "apiserver_audit_events", + title: "Audit Events", + description: + "Number of audit events processed. Monitors audit pipeline throughput.", + legend: "Events", + legendUnit: "", + metricName: "apiserver_audit_event_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "apiserver_tls_handshake_errors", + title: "TLS Handshake Errors", + description: + "Number of TLS handshake errors. Indicates certificate or connectivity issues.", + legend: "Errors", + legendUnit: "", + metricName: "apiserver_tls_handshake_errors_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + ]; +} + +function getSchedulerQueries(cluster: string): Array { + return [ + buildQuery( + { + variable: "scheduler_pending", + title: "Pending Pods", + description: + "Number of pods waiting to be scheduled. Sustained values indicate scheduling pressure.", + legend: "Pending Pods", + legendUnit: "", + metricName: "scheduler_pending_pods", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "scheduler_latency", + title: "Scheduling Latency", + description: "End-to-end scheduling duration from pod creation to binding.", + legend: "Latency", + legendUnit: "seconds", + metricName: "scheduler_e2e_scheduling_duration_seconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "scheduler_attempts", + title: "Scheduling Attempts", + description: "Number of scheduling attempts by result (scheduled, unschedulable, error).", + legend: "Attempts", + legendUnit: "", + metricName: "scheduler_schedule_attempts_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "scheduler_preemptions", + title: "Preemption Attempts", + description: "Number of preemption attempts to free resources for higher-priority pods.", + legend: "Preemptions", + legendUnit: "", + metricName: "scheduler_preemption_attempts_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "scheduler_queue_duration", + title: "Queue Wait Duration", + description: "Time pods spend in the scheduling queue before being scheduled.", + legend: "Duration", + legendUnit: "seconds", + metricName: "scheduler_scheduling_attempt_duration_seconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + ]; +} + +function getControllerManagerQueries( + cluster: string, +): Array { + return [ + buildQuery( + { + variable: "controller_queue_depth", + title: "Work Queue Depth", + description: + "Current depth of controller work queues. High values mean controllers are falling behind.", + legend: "Queue Depth", + legendUnit: "", + metricName: "workqueue_depth", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "controller_queue_latency", + title: "Queue Latency", + description: "Time items spend waiting in the work queue before processing.", + legend: "Latency", + legendUnit: "seconds", + metricName: "workqueue_queue_duration_seconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "controller_work_duration", + title: "Work Duration", + description: "Time spent processing a single work queue item.", + legend: "Duration", + legendUnit: "seconds", + metricName: "workqueue_work_duration_seconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "controller_retries", + title: "Retries", + description: + "Number of work queue item retries. Frequent retries indicate reconciliation failures.", + legend: "Retries", + legendUnit: "", + metricName: "workqueue_retries_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "controller_adds", + title: "Queue Additions", + description: "Rate of items being added to work queues.", + legend: "Adds", + legendUnit: "", + metricName: "workqueue_adds_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + ]; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Main component +// ────────────────────────────────────────────────────────────────────────────── const KubernetesClusterControlPlane: FunctionComponent< PageComponentProps @@ -31,14 +461,28 @@ const KubernetesClusterControlPlane: FunctionComponent< const [cluster, setCluster] = useState(null); const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(""); - const [etcdMetricViewData, setEtcdMetricViewData] = - useState(null); - const [apiServerMetricViewData, setApiServerMetricViewData] = - useState(null); - const [schedulerMetricViewData, setSchedulerMetricViewData] = - useState(null); - const [controllerMetricViewData, setControllerMetricViewData] = - useState(null); + + const [timeRange, setTimeRange] = useState({ + range: TimeRange.PAST_ONE_HOUR, + }); + + const [startAndEndDate, setStartAndEndDate] = useState>( + RangeStartAndEndDateTimeUtil.getStartAndEndDate({ + range: TimeRange.PAST_ONE_HOUR, + }), + ); + + const handleTimeRangeChange: ( + newTimeRange: RangeStartAndEndDateTime, + ) => void = useCallback( + (newTimeRange: RangeStartAndEndDateTime): void => { + setTimeRange(newTimeRange); + setStartAndEndDate( + RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange), + ); + }, + [], + ); const fetchCluster: PromiseVoidFunction = async (): Promise => { setIsLoading(true); @@ -63,180 +507,6 @@ const KubernetesClusterControlPlane: FunctionComponent< }); }, []); - useEffect(() => { - if (!cluster) { - return; - } - - const clusterIdentifier: string = cluster.clusterIdentifier || ""; - const endDate: Date = OneUptimeDate.getCurrentDate(); - const startDate: Date = OneUptimeDate.addRemoveHours(endDate, -6); - const startAndEndDate: InBetween = new InBetween(startDate, endDate); - - const etcdDbSizeQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "etcd_db_size", - title: "etcd Database Size", - description: "Total size of the etcd database", - legend: "DB Size", - legendUnit: "", - }, - metricQueryData: { - filterData: { - metricName: "etcd_mvcc_db_total_size_in_bytes", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Avg, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - yAxisValueFormatter: KubernetesResourceUtils.formatBytesForChart, - }; - - const apiServerRequestRateQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "apiserver_requests", - title: "API Server Request Rate", - description: "Total API server requests by verb", - legend: "Requests", - legendUnit: "req/s", - }, - metricQueryData: { - filterData: { - metricName: "apiserver_request_total", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Sum, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - const apiServerLatencyQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "apiserver_latency", - title: "API Server Request Latency", - description: "API server request duration", - legend: "Latency", - legendUnit: "seconds", - }, - metricQueryData: { - filterData: { - metricName: "apiserver_request_duration_seconds", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Avg, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - const schedulerPendingQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "scheduler_pending", - title: "Scheduler Pending Pods", - description: "Number of pods pending scheduling", - legend: "Pending Pods", - legendUnit: "", - }, - metricQueryData: { - filterData: { - metricName: "scheduler_pending_pods", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Avg, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - const schedulerLatencyQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "scheduler_latency", - title: "Scheduler Latency", - description: "End-to-end scheduling latency", - legend: "Latency", - legendUnit: "seconds", - }, - metricQueryData: { - filterData: { - metricName: "scheduler_e2e_scheduling_duration_seconds", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Avg, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - const controllerQueueDepthQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "controller_queue", - title: "Controller Manager Queue Depth", - description: "Work queue depth for controller manager", - legend: "Queue Depth", - legendUnit: "", - }, - metricQueryData: { - filterData: { - metricName: "workqueue_depth", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Avg, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - setEtcdMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [etcdDbSizeQuery], - formulaConfigs: [], - }); - - setApiServerMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [apiServerRequestRateQuery, apiServerLatencyQuery], - formulaConfigs: [], - }); - - setSchedulerMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [schedulerPendingQuery, schedulerLatencyQuery], - formulaConfigs: [], - }); - - setControllerMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [controllerQueueDepthQuery], - formulaConfigs: [], - }); - }, [cluster]); - if (isLoading) { return ; } @@ -245,70 +515,95 @@ const KubernetesClusterControlPlane: FunctionComponent< return ; } - if ( - !cluster || - !etcdMetricViewData || - !apiServerMetricViewData || - !schedulerMetricViewData || - !controllerMetricViewData - ) { + if (!cluster) { return ; } + const clusterIdentifier: string = cluster.clusterIdentifier || ""; + + // Build all metric view data sections + const etcdData: MetricViewData = buildMetricViewData( + getEtcdQueries(clusterIdentifier), + startAndEndDate, + ); + const apiServerData: MetricViewData = buildMetricViewData( + getApiServerQueries(clusterIdentifier), + startAndEndDate, + ); + const schedulerData: MetricViewData = buildMetricViewData( + getSchedulerQueries(clusterIdentifier), + startAndEndDate, + ); + const controllerData: MetricViewData = buildMetricViewData( + getControllerManagerQueries(clusterIdentifier), + startAndEndDate, + ); + return ( -
-

- Control plane metrics require the controlPlane.enabled{" "} - flag to be set to true in the kubernetes-agent Helm chart - values. This is typically only available for self-managed Kubernetes - clusters, not managed services like EKS, GKE, or AKS. -

+ {/* Info banner */} +
+
+ +
+
+

+ Control Plane Metrics Configuration +

+

+ Control plane metrics require{" "} + + controlPlane.enabled: true + {" "} + in the kubernetes-agent Helm chart values. This is typically only + available for self-managed clusters, not managed services like EKS, + GKE, or AKS. +

+
- + +
+ + {/* etcd */} + - {}} - /> - + description="Distributed key-value store backing all cluster state. Monitors database size, disk I/O latency, leader stability, and replication health." + icon={IconProp.Database} + data={etcdData} + /> - - {}} - /> - + description="Central management entity that validates and serves all REST operations. Tracks request throughput, latency, error rates, and connection health." + icon={IconProp.Globe} + data={apiServerData} + /> - - {}} - /> - + description="Assigns pods to nodes based on resource requirements, affinity, and constraints. Monitors scheduling throughput, queue pressure, and preemption activity." + icon={IconProp.AdjustmentHorizontal} + data={schedulerData} + /> - - {}} - /> - + description="Runs core control loops that reconcile cluster state. Tracks work queue depth, processing latency, retries, and throughput across all controllers." + icon={IconProp.Settings} + data={controllerData} + />
); }; diff --git a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx index 744f66eeae..6156dde43a 100644 --- a/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx +++ b/App/FeatureSet/Dashboard/src/Pages/Kubernetes/View/ServiceMesh.tsx @@ -7,12 +7,14 @@ import MetricView from "../../../Components/Metrics/MetricView"; import MetricViewData from "Common/Types/Metrics/MetricViewData"; import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData"; import AggregationType from "Common/Types/BaseDatabase/AggregationType"; -import OneUptimeDate from "Common/Types/Date"; import InBetween from "Common/Types/BaseDatabase/InBetween"; +import IconProp from "Common/Types/Icon/IconProp"; +import Icon from "Common/UI/Components/Icon/Icon"; import React, { Fragment, FunctionComponent, ReactElement, + useCallback, useEffect, useState, } from "react"; @@ -21,6 +23,536 @@ import API from "Common/UI/Utils/API/API"; import PageLoader from "Common/UI/Components/Loader/PageLoader"; import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage"; import { PromiseVoidFunction } from "Common/Types/FunctionTypes"; +import RangeStartAndEndDateTime, { + RangeStartAndEndDateTimeUtil, +} from "Common/Types/Time/RangeStartAndEndDateTime"; +import TimeRange from "Common/Types/Time/TimeRange"; +import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView"; +import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils"; + +// ────────────────────────────────────────────────────────────────────────────── +// Query builder helper +// ────────────────────────────────────────────────────────────────────────────── + +interface MetricQuerySpec { + variable: string; + title: string; + description: string; + legend: string; + legendUnit: string; + metricName: string; + aggregation: AggregationType; + yAxisFormatter?: (value: number) => string; +} + +function buildQuery( + spec: MetricQuerySpec, + clusterIdentifier: string, +): MetricQueryConfigData { + return { + metricAliasData: { + metricVariable: spec.variable, + title: spec.title, + description: spec.description, + legend: spec.legend, + legendUnit: spec.legendUnit, + }, + metricQueryData: { + filterData: { + metricName: spec.metricName, + attributes: { + "resource.k8s.cluster.name": clusterIdentifier, + }, + aggegationType: spec.aggregation, + aggregateBy: {}, + }, + groupBy: { + attributes: true, + }, + }, + yAxisValueFormatter: spec.yAxisFormatter, + }; +} + +function buildMetricViewData( + queries: Array, + startAndEndDate: InBetween, +): MetricViewData { + return { + startAndEndDate, + queryConfigs: queries, + formulaConfigs: [], + }; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Section component +// ────────────────────────────────────────────────────────────────────────────── + +interface SectionProps { + title: string; + description: string; + icon: IconProp; + data: MetricViewData; +} + +const MeshSection: FunctionComponent = ( + props: SectionProps, +): ReactElement => { + return ( + + + {props.title} + + } + description={props.description} + > + {}} + /> + + ); +}; + +// ────────────────────────────────────────────────────────────────────────────── +// Istio metric specs +// ────────────────────────────────────────────────────────────────────────────── + +function getIstioQueries(cluster: string): Array { + return [ + buildQuery( + { + variable: "istio_requests_total", + title: "Request Rate", + description: + "Total request throughput across all Envoy sidecars in the mesh.", + legend: "Requests", + legendUnit: "req/s", + metricName: "istio_requests_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "istio_request_duration", + title: "Request Duration", + description: + "Average request latency through the mesh. High values indicate service or network slowness.", + legend: "Duration", + legendUnit: "ms", + metricName: "istio_request_duration_milliseconds_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "istio_request_bytes", + title: "Request Size", + description: + "Size of HTTP request bodies flowing through the mesh.", + legend: "Size", + legendUnit: "", + metricName: "istio_request_bytes_sum", + aggregation: AggregationType.Sum, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "istio_response_bytes", + title: "Response Size", + description: + "Size of HTTP response bodies flowing through the mesh.", + legend: "Size", + legendUnit: "", + metricName: "istio_response_bytes_sum", + aggregation: AggregationType.Sum, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "istio_tcp_sent_bytes", + title: "TCP Bytes Sent", + description: + "Total bytes sent over TCP connections in the mesh. Includes all L4 traffic.", + legend: "Sent", + legendUnit: "", + metricName: "istio_tcp_sent_bytes_total", + aggregation: AggregationType.Sum, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "istio_tcp_received_bytes", + title: "TCP Bytes Received", + description: + "Total bytes received over TCP connections in the mesh.", + legend: "Received", + legendUnit: "", + metricName: "istio_tcp_received_bytes_total", + aggregation: AggregationType.Sum, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "istio_tcp_connections_opened", + title: "TCP Connections Opened", + description: + "Number of new TCP connections opened. Spikes may indicate connection churn.", + legend: "Opened", + legendUnit: "", + metricName: "istio_tcp_connections_opened_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "istio_tcp_connections_closed", + title: "TCP Connections Closed", + description: + "Number of TCP connections closed. Compare with opened to detect leaks.", + legend: "Closed", + legendUnit: "", + metricName: "istio_tcp_connections_closed_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + ]; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Istio Pilot / Control Plane metric specs +// ────────────────────────────────────────────────────────────────────────────── + +function getIstioPilotQueries(cluster: string): Array { + return [ + buildQuery( + { + variable: "pilot_xds_pushes", + title: "xDS Config Pushes", + description: + "Number of xDS configuration pushes to Envoy sidecars. High rates mean frequent config changes.", + legend: "Pushes", + legendUnit: "", + metricName: "pilot_xds_pushes", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "pilot_xds_push_errors", + title: "xDS Push Errors", + description: + "Configuration push failures to sidecars. Non-zero values indicate connectivity or config issues.", + legend: "Errors", + legendUnit: "", + metricName: "pilot_total_xds_internal_errors", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "pilot_proxy_convergence", + title: "Proxy Convergence Time", + description: + "Time for configuration changes to propagate to all Envoy proxies.", + legend: "Duration", + legendUnit: "seconds", + metricName: "pilot_proxy_convergence_time_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "pilot_xds_connected", + title: "Connected Proxies", + description: + "Number of Envoy proxies currently connected to Pilot (istiod).", + legend: "Proxies", + legendUnit: "", + metricName: "pilot_xds", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "pilot_conflict_inbound", + title: "Inbound Listener Conflicts", + description: + "Number of inbound listener conflicts. Indicates overlapping port configurations.", + legend: "Conflicts", + legendUnit: "", + metricName: "pilot_conflict_inbound_listener", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "pilot_conflict_outbound", + title: "Outbound Listener Conflicts", + description: + "Number of outbound listener conflicts between services.", + legend: "Conflicts", + legendUnit: "", + metricName: "pilot_conflict_outbound_listener_tcp_over_current_tcp", + aggregation: AggregationType.Sum, + }, + cluster, + ), + ]; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Envoy Proxy metric specs +// ────────────────────────────────────────────────────────────────────────────── + +function getEnvoyQueries(cluster: string): Array { + return [ + buildQuery( + { + variable: "envoy_cluster_upstream_cx_total", + title: "Upstream Connections", + description: + "Total upstream connections initiated by Envoy proxies to backend services.", + legend: "Connections", + legendUnit: "", + metricName: "envoy_cluster_upstream_cx_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "envoy_cluster_upstream_cx_active", + title: "Active Upstream Connections", + description: + "Currently active upstream connections. Shows real-time connection pool utilization.", + legend: "Active", + legendUnit: "", + metricName: "envoy_cluster_upstream_cx_active", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "envoy_cluster_upstream_rq_timeout", + title: "Upstream Request Timeouts", + description: + "Requests that timed out waiting for an upstream response. Indicates slow or unresponsive services.", + legend: "Timeouts", + legendUnit: "", + metricName: "envoy_cluster_upstream_rq_timeout", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "envoy_cluster_upstream_rq_retry", + title: "Upstream Retries", + description: + "Number of request retries to upstream services. High retries indicate flaky backends.", + legend: "Retries", + legendUnit: "", + metricName: "envoy_cluster_upstream_rq_retry", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "envoy_cluster_upstream_cx_connect_fail", + title: "Connection Failures", + description: + "Failed upstream connection attempts. Indicates network or service availability issues.", + legend: "Failures", + legendUnit: "", + metricName: "envoy_cluster_upstream_cx_connect_fail", + aggregation: AggregationType.Sum, + }, + cluster, + ), + ]; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Linkerd metric specs +// ────────────────────────────────────────────────────────────────────────────── + +function getLinkerdQueries(cluster: string): Array { + return [ + buildQuery( + { + variable: "linkerd_request_total", + title: "Request Rate", + description: + "Total request throughput across all Linkerd proxy sidecars.", + legend: "Requests", + legendUnit: "req/s", + metricName: "request_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_response_latency", + title: "Response Latency", + description: + "End-to-end response latency measured at the Linkerd proxy.", + legend: "Latency", + legendUnit: "ms", + metricName: "response_latency_ms_sum", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_tcp_open_connections", + title: "Open TCP Connections", + description: + "Number of currently open TCP connections managed by Linkerd proxies.", + legend: "Connections", + legendUnit: "", + metricName: "tcp_open_connections", + aggregation: AggregationType.Avg, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_tcp_read_bytes", + title: "TCP Read Bytes", + description: + "Total bytes read from TCP connections by Linkerd proxies.", + legend: "Read", + legendUnit: "", + metricName: "tcp_read_bytes_total", + aggregation: AggregationType.Sum, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_tcp_write_bytes", + title: "TCP Write Bytes", + description: + "Total bytes written to TCP connections by Linkerd proxies.", + legend: "Written", + legendUnit: "", + metricName: "tcp_write_bytes_total", + aggregation: AggregationType.Sum, + yAxisFormatter: KubernetesResourceUtils.formatBytesForChart, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_tcp_open_total", + title: "TCP Connections Opened", + description: + "Total TCP connections opened over time by Linkerd proxies.", + legend: "Opened", + legendUnit: "", + metricName: "tcp_open_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_tcp_close_total", + title: "TCP Connections Closed", + description: + "Total TCP connections closed. Compare with opened to detect leaks.", + legend: "Closed", + legendUnit: "", + metricName: "tcp_close_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + ]; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Linkerd Control Plane metric specs +// ────────────────────────────────────────────────────────────────────────────── + +function getLinkerdControlPlaneQueries( + cluster: string, +): Array { + return [ + buildQuery( + { + variable: "linkerd_identity_certs_issued", + title: "mTLS Certificates Issued", + description: + "Number of mTLS identity certificates issued by the Linkerd identity service.", + legend: "Certificates", + legendUnit: "", + metricName: "identity_cert_rotation_count", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_destination_gets", + title: "Destination Lookups", + description: + "Service discovery lookups to the destination controller. Shows mesh routing activity.", + legend: "Lookups", + legendUnit: "", + metricName: "destination_get_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + buildQuery( + { + variable: "linkerd_proxy_injector", + title: "Proxy Injections", + description: + "Number of proxy sidecar injections performed by the webhook.", + legend: "Injections", + legendUnit: "", + metricName: "proxy_injector_injection_total", + aggregation: AggregationType.Sum, + }, + cluster, + ), + ]; +} + +// ────────────────────────────────────────────────────────────────────────────── +// Main component +// ────────────────────────────────────────────────────────────────────────────── const KubernetesClusterServiceMesh: FunctionComponent< PageComponentProps @@ -30,14 +562,28 @@ const KubernetesClusterServiceMesh: FunctionComponent< const [cluster, setCluster] = useState(null); const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(""); - const [istioRequestsMetricViewData, setIstioRequestsMetricViewData] = - useState(null); - const [istioLatencyMetricViewData, setIstioLatencyMetricViewData] = - useState(null); - const [linkerdRequestsMetricViewData, setLinkerdRequestsMetricViewData] = - useState(null); - const [linkerdLatencyMetricViewData, setLinkerdLatencyMetricViewData] = - useState(null); + + const [timeRange, setTimeRange] = useState({ + range: TimeRange.PAST_ONE_HOUR, + }); + + const [startAndEndDate, setStartAndEndDate] = useState>( + RangeStartAndEndDateTimeUtil.getStartAndEndDate({ + range: TimeRange.PAST_ONE_HOUR, + }), + ); + + const handleTimeRangeChange: ( + newTimeRange: RangeStartAndEndDateTime, + ) => void = useCallback( + (newTimeRange: RangeStartAndEndDateTime): void => { + setTimeRange(newTimeRange); + setStartAndEndDate( + RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange), + ); + }, + [], + ); const fetchCluster: PromiseVoidFunction = async (): Promise => { setIsLoading(true); @@ -62,137 +608,6 @@ const KubernetesClusterServiceMesh: FunctionComponent< }); }, []); - useEffect(() => { - if (!cluster) { - return; - } - - const clusterIdentifier: string = cluster.clusterIdentifier || ""; - const endDate: Date = OneUptimeDate.getCurrentDate(); - const startDate: Date = OneUptimeDate.addRemoveHours(endDate, -6); - const startAndEndDate: InBetween = new InBetween(startDate, endDate); - - // Istio metrics - const istioRequestsTotalQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "istio_requests_total", - title: "Istio Request Rate", - description: "Total requests through the Istio service mesh", - legend: "Requests", - legendUnit: "req/s", - }, - metricQueryData: { - filterData: { - metricName: "istio_requests_total", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Sum, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - const istioRequestDurationQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "istio_request_duration", - title: "Istio Request Latency", - description: - "Request duration through the Istio service mesh (p50/p99)", - legend: "Latency", - legendUnit: "ms", - }, - metricQueryData: { - filterData: { - metricName: "istio_request_duration_milliseconds_bucket", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Avg, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - // Linkerd metrics - const linkerdRequestTotalQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "linkerd_request_total", - title: "Linkerd Request Rate", - description: "Total requests through the Linkerd service mesh", - legend: "Requests", - legendUnit: "req/s", - }, - metricQueryData: { - filterData: { - metricName: "request_total", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Sum, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - const linkerdResponseLatencyQuery: MetricQueryConfigData = { - metricAliasData: { - metricVariable: "linkerd_response_latency", - title: "Linkerd Response Latency", - description: - "Response latency through the Linkerd service mesh (p50/p99)", - legend: "Latency", - legendUnit: "ms", - }, - metricQueryData: { - filterData: { - metricName: "response_latency_ms_bucket", - attributes: { - "resource.k8s.cluster.name": clusterIdentifier, - }, - aggegationType: AggregationType.Avg, - aggregateBy: {}, - }, - groupBy: { - attributes: true, - }, - }, - }; - - setIstioRequestsMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [istioRequestsTotalQuery], - formulaConfigs: [], - }); - - setIstioLatencyMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [istioRequestDurationQuery], - formulaConfigs: [], - }); - - setLinkerdRequestsMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [linkerdRequestTotalQuery], - formulaConfigs: [], - }); - - setLinkerdLatencyMetricViewData({ - startAndEndDate: startAndEndDate, - queryConfigs: [linkerdResponseLatencyQuery], - formulaConfigs: [], - }); - }, [cluster]); - if (isLoading) { return ; } @@ -201,71 +616,126 @@ const KubernetesClusterServiceMesh: FunctionComponent< return ; } - if ( - !cluster || - !istioRequestsMetricViewData || - !istioLatencyMetricViewData || - !linkerdRequestsMetricViewData || - !linkerdLatencyMetricViewData - ) { + if (!cluster) { return ; } + const clusterIdentifier: string = cluster.clusterIdentifier || ""; + + // Build all metric view data + const istioData: MetricViewData = buildMetricViewData( + getIstioQueries(clusterIdentifier), + startAndEndDate, + ); + const istioPilotData: MetricViewData = buildMetricViewData( + getIstioPilotQueries(clusterIdentifier), + startAndEndDate, + ); + const envoyData: MetricViewData = buildMetricViewData( + getEnvoyQueries(clusterIdentifier), + startAndEndDate, + ); + const linkerdData: MetricViewData = buildMetricViewData( + getLinkerdQueries(clusterIdentifier), + startAndEndDate, + ); + const linkerdControlPlaneData: MetricViewData = buildMetricViewData( + getLinkerdControlPlaneQueries(clusterIdentifier), + startAndEndDate, + ); + return ( -
-

- Service mesh metrics require the serviceMesh.enabled flag - to be set to true and the{" "} - serviceMesh.provider to be configured in the - kubernetes-agent Helm chart values. Supported providers are Istio and - Linkerd. + {/* Info banner */} +

+
+ +
+
+

+ Service Mesh Metrics Configuration +

+

+ Service mesh metrics require{" "} + + serviceMesh.enabled: true + {" "} + and{" "} + + serviceMesh.provider + {" "} + to be configured in the kubernetes-agent Helm chart values. + Supported providers are Istio and Linkerd. Only the sections + matching your provider will show data. +

+
+
+ + {/* Global time range picker */} +
+ +
+ + {/* ── Istio Sections ─────────────────────────────────────────────── */} + +
+

Istio

+

+ Metrics from Istio Envoy sidecars and Pilot (istiod) control plane.

- - {}} - /> - + {/* Istio Data Plane */} + - - {}} - /> - + {/* Istio Control Plane (Pilot / istiod) */} + - - {}} - /> - + {/* Envoy Proxy internals */} + - - {}} - /> - + {/* ── Linkerd Sections ───────────────────────────────────────────── */} + +
+

Linkerd

+

+ Metrics from Linkerd proxy sidecars and control plane components. +

+
+ + {/* Linkerd Data Plane */} + + + {/* Linkerd Control Plane */} + ); };