feat: enhance Kubernetes service mesh metrics view with Istio and Linkerd support

- Introduced new metric query specifications for Istio and Linkerd.
- Refactored metric data fetching to utilize a unified query builder.
- Added a global time range picker for metrics visualization.
- Created reusable MeshSection component for displaying metrics.
- Updated UI to present Istio and Linkerd metrics in organized sections.
This commit is contained in:
Nawaz Dhandala
2026-03-25 13:19:33 +00:00
parent 006e54535a
commit 5cb48400a2
2 changed files with 1193 additions and 428 deletions

View File

@@ -7,12 +7,13 @@ import MetricView from "../../../Components/Metrics/MetricView";
import MetricViewData from "Common/Types/Metrics/MetricViewData";
import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData";
import AggregationType from "Common/Types/BaseDatabase/AggregationType";
import OneUptimeDate from "Common/Types/Date";
import InBetween from "Common/Types/BaseDatabase/InBetween";
import IconProp from "Common/Types/Icon/IconProp";
import Icon from "Common/UI/Components/Icon/Icon";
import React, {
Fragment,
FunctionComponent,
ReactElement,
useCallback,
useEffect,
useState,
} from "react";
@@ -22,6 +23,435 @@ import PageLoader from "Common/UI/Components/Loader/PageLoader";
import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage";
import { PromiseVoidFunction } from "Common/Types/FunctionTypes";
import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils";
import RangeStartAndEndDateTime, {
RangeStartAndEndDateTimeUtil,
} from "Common/Types/Time/RangeStartAndEndDateTime";
import TimeRange from "Common/Types/Time/TimeRange";
import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView";
import InBetween from "Common/Types/BaseDatabase/InBetween";
// ──────────────────────────────────────────────────────────────────────────────
// Query builder helper
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Compact description of a single metric chart. buildQuery expands one of
 * these into a full MetricQueryConfigData.
 */
interface MetricQuerySpec {
// Copied to metricAliasData.metricVariable.
variable: string;
// Copied to metricAliasData.title.
title: string;
// Copied to metricAliasData.description.
description: string;
// Copied to metricAliasData.legend.
legend: string;
// Copied to metricAliasData.legendUnit; an empty string is used where no unit applies.
legendUnit: string;
// Name of the metric placed in the query's filterData.metricName.
metricName: string;
// Aggregation placed in the query's filterData.aggegationType.
aggregation: AggregationType;
// Optional formatter passed through as yAxisValueFormatter.
yAxisFormatter?: (value: number) => string;
}
/**
 * Expands a compact metric spec into a full query configuration.
 *
 * The resulting query filters on the spec's metric name and on the
 * "resource.k8s.cluster.name" attribute, and groups results by attributes.
 *
 * @param spec - Chart labels, metric name, aggregation and optional formatter.
 * @param clusterIdentifier - Value matched against "resource.k8s.cluster.name".
 * @returns The query configuration for one chart.
 */
function buildQuery(
  spec: MetricQuerySpec,
  clusterIdentifier: string,
): MetricQueryConfigData {
  const {
    variable,
    title,
    description,
    legend,
    legendUnit,
    metricName,
    aggregation,
    yAxisFormatter,
  } = spec;

  const queryConfig: MetricQueryConfigData = {
    metricAliasData: {
      metricVariable: variable,
      title: title,
      description: description,
      legend: legend,
      legendUnit: legendUnit,
    },
    metricQueryData: {
      filterData: {
        metricName: metricName,
        // Restrict the series to the cluster being viewed.
        attributes: {
          "resource.k8s.cluster.name": clusterIdentifier,
        },
        // "aggegationType" is the field name used by the query data type.
        aggegationType: aggregation,
        aggregateBy: {},
      },
      groupBy: {
        attributes: true,
      },
    },
    // Stays undefined when the spec supplies no formatter.
    yAxisValueFormatter: yAxisFormatter,
  };

  return queryConfig;
}
/**
 * Wraps a list of query configs and a time window into a MetricViewData.
 * No formulas are attached (formulaConfigs is always empty).
 *
 * @param queries - Query configurations to display.
 * @param startAndEndDate - Time window shared by every query.
 * @returns The view data for one section.
 */
function buildMetricViewData(
  queries: Array<MetricQueryConfigData>,
  startAndEndDate: InBetween<Date>,
): MetricViewData {
  const viewData: MetricViewData = {
    startAndEndDate: startAndEndDate,
    queryConfigs: queries,
    formulaConfigs: [],
  };

  return viewData;
}
// ──────────────────────────────────────────────────────────────────────────────
// Section component — one Card per control plane component
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Props for ControlPlaneSection.
 */
interface SectionProps {
// Text shown next to the icon in the card title.
title: string;
// Passed to the Card as its description.
description: string;
// Icon rendered before the title text.
icon: IconProp;
// Metric view data handed to MetricView inside the card.
data: MetricViewData;
}
/**
 * Renders one section as a Card whose title is an icon plus text and whose
 * body is a MetricView showing the supplied data.
 *
 * The MetricView is rendered with query elements, its own start/end date
 * picker and per-chart cards hidden; its onChange handler is a no-op.
 */
const ControlPlaneSection: FunctionComponent<SectionProps> = (
  props: SectionProps,
): ReactElement => {
  const { title, description, icon, data } = props;

  // Card title: icon followed by the section name.
  const heading: ReactElement = (
    <div className="flex items-center gap-2">
      <Icon icon={icon} className="h-5 w-5 text-gray-500" />
      <span>{title}</span>
    </div>
  );

  return (
    <Card title={heading} description={description}>
      <MetricView
        data={data}
        hideQueryElements={true}
        hideStartAndEndDate={true}
        hideCardInCharts={true}
        onChange={() => {}}
      />
    </Card>
  );
};
// ──────────────────────────────────────────────────────────────────────────────
// Metric specs per control plane component
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Builds the query configs for the etcd section: database size, in-use size,
 * key count, leader changes, failed proposals, WAL fsync and backend commit
 * duration sums, and peer sent bytes.
 *
 * @param cluster - Cluster identifier passed to buildQuery for every spec.
 * @returns One query config per spec, in the order listed below.
 */
function getEtcdQueries(cluster: string): Array<MetricQueryConfigData> {
  const specs: Array<MetricQuerySpec> = [
    {
      variable: "etcd_db_size",
      title: "Database Size",
      description: "Total size of the etcd MVCC database on disk.",
      legend: "DB Size",
      legendUnit: "",
      metricName: "etcd_mvcc_db_total_size_in_bytes",
      aggregation: AggregationType.Avg,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    {
      variable: "etcd_db_size_in_use",
      title: "Database Size In Use",
      description: "Actual used size of the etcd database (after compaction).",
      legend: "In Use",
      legendUnit: "",
      metricName: "etcd_mvcc_db_total_size_in_use_in_bytes",
      aggregation: AggregationType.Avg,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    {
      variable: "etcd_keys_total",
      title: "Total Keys",
      description: "Total number of keys stored in etcd.",
      legend: "Keys",
      legendUnit: "",
      metricName: "etcd_debugging_mvcc_keys_total",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "etcd_leader_changes",
      title: "Leader Changes",
      description:
        "Number of leader changes seen. Frequent changes indicate instability.",
      legend: "Changes",
      legendUnit: "",
      metricName: "etcd_server_leader_changes_seen_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "etcd_proposals_failed",
      title: "Failed Proposals",
      description: "Total number of failed raft proposals.",
      legend: "Failed",
      legendUnit: "",
      metricName: "etcd_server_proposals_failed_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "etcd_disk_wal_fsync",
      title: "WAL Fsync Duration",
      description:
        "Latency of WAL fsync operations. High values cause slow commits.",
      legend: "Duration",
      legendUnit: "seconds",
      metricName: "etcd_disk_wal_fsync_duration_seconds_sum",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "etcd_disk_backend_commit",
      title: "Backend Commit Duration",
      description: "Latency of backend commit operations.",
      legend: "Duration",
      legendUnit: "seconds",
      metricName: "etcd_disk_backend_commit_duration_seconds_sum",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "etcd_network_peer_sent",
      title: "Network Peer Sent Bytes",
      description: "Total bytes sent to peers. Shows replication traffic.",
      legend: "Sent",
      legendUnit: "",
      metricName: "etcd_network_peer_sent_bytes_total",
      aggregation: AggregationType.Sum,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
  ];

  return specs.map((spec: MetricQuerySpec): MetricQueryConfigData => {
    return buildQuery(spec, cluster);
  });
}
/**
 * Builds the query configs for the API server section: request count,
 * request duration sum, request errors, in-flight requests, audit events and
 * TLS handshake errors.
 *
 * @param cluster - Cluster identifier passed to buildQuery for every spec.
 * @returns One query config per spec, in the order listed below.
 */
function getApiServerQueries(cluster: string): Array<MetricQueryConfigData> {
  const specs: Array<MetricQuerySpec> = [
    {
      variable: "apiserver_requests",
      title: "Request Rate",
      description: "Total API server requests.",
      legend: "Requests",
      legendUnit: "req/s",
      metricName: "apiserver_request_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "apiserver_latency",
      title: "Request Latency",
      description: "Average API server request duration.",
      legend: "Latency",
      legendUnit: "seconds",
      metricName: "apiserver_request_duration_seconds_sum",
      aggregation: AggregationType.Avg,
    },
    /*
     * NOTE(review): this spec uses the same metricName and aggregation as
     * "Request Rate" above, and buildQuery only filters by cluster name, so
     * as written it does not appear to restrict results to 5xx responses.
     * Confirm whether a status-code filter is missing.
     */
    {
      variable: "apiserver_errors",
      title: "Request Errors (5xx)",
      description:
        "API server 5xx error count. Non-zero values need investigation.",
      legend: "Errors",
      legendUnit: "",
      metricName: "apiserver_request_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "apiserver_inflight_requests",
      title: "In-Flight Requests",
      description:
        "Current number of in-flight requests being processed. High counts indicate API server saturation.",
      legend: "Requests",
      legendUnit: "",
      metricName: "apiserver_current_inflight_requests",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "apiserver_audit_events",
      title: "Audit Events",
      description:
        "Number of audit events processed. Monitors audit pipeline throughput.",
      legend: "Events",
      legendUnit: "",
      metricName: "apiserver_audit_event_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "apiserver_tls_handshake_errors",
      title: "TLS Handshake Errors",
      description:
        "Number of TLS handshake errors. Indicates certificate or connectivity issues.",
      legend: "Errors",
      legendUnit: "",
      metricName: "apiserver_tls_handshake_errors_total",
      aggregation: AggregationType.Sum,
    },
  ];

  return specs.map((spec: MetricQuerySpec): MetricQueryConfigData => {
    return buildQuery(spec, cluster);
  });
}
/**
 * Builds the query configs for the scheduler section: pending pods,
 * end-to-end scheduling duration sum, scheduling attempts, preemption
 * attempts and scheduling-attempt duration sum.
 *
 * @param cluster - Cluster identifier passed to buildQuery for every spec.
 * @returns One query config per spec, in the order listed below.
 */
function getSchedulerQueries(cluster: string): Array<MetricQueryConfigData> {
  const specs: Array<MetricQuerySpec> = [
    {
      variable: "scheduler_pending",
      title: "Pending Pods",
      description:
        "Number of pods waiting to be scheduled. Sustained values indicate scheduling pressure.",
      legend: "Pending Pods",
      legendUnit: "",
      metricName: "scheduler_pending_pods",
      aggregation: AggregationType.Avg,
    },
    /*
     * NOTE(review): scheduler_e2e_scheduling_duration_seconds is, as far as
     * I know, deprecated in newer Kubernetes releases in favour of
     * scheduler_scheduling_attempt_duration_seconds. Confirm this chart
     * still receives data on the cluster versions that are supported.
     */
    {
      variable: "scheduler_latency",
      title: "Scheduling Latency",
      description:
        "End-to-end scheduling duration from pod creation to binding.",
      legend: "Latency",
      legendUnit: "seconds",
      metricName: "scheduler_e2e_scheduling_duration_seconds_sum",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "scheduler_attempts",
      title: "Scheduling Attempts",
      description:
        "Number of scheduling attempts by result (scheduled, unschedulable, error).",
      legend: "Attempts",
      legendUnit: "",
      metricName: "scheduler_schedule_attempts_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "scheduler_preemptions",
      title: "Preemption Attempts",
      description:
        "Number of preemption attempts to free resources for higher-priority pods.",
      legend: "Preemptions",
      legendUnit: "",
      metricName: "scheduler_preemption_attempts_total",
      aggregation: AggregationType.Sum,
    },
    /*
     * scheduler_scheduling_attempt_duration_seconds measures the latency of
     * a scheduling attempt (scheduling algorithm plus binding), not the time
     * a pod waits in the queue, so the title and description say that. The
     * variable name is left unchanged so existing references keep working.
     */
    {
      variable: "scheduler_queue_duration",
      title: "Scheduling Attempt Duration",
      description:
        "Latency of individual scheduling attempts (scheduling algorithm plus binding).",
      legend: "Duration",
      legendUnit: "seconds",
      metricName: "scheduler_scheduling_attempt_duration_seconds_sum",
      aggregation: AggregationType.Avg,
    },
  ];

  return specs.map((spec: MetricQuerySpec): MetricQueryConfigData => {
    return buildQuery(spec, cluster);
  });
}
/**
 * Builds the query configs for the controller manager section, all based on
 * workqueue metrics: depth, queue duration sum, work duration sum, retries
 * and adds.
 *
 * @param cluster - Cluster identifier passed to buildQuery for every spec.
 * @returns One query config per spec, in the order listed below.
 */
function getControllerManagerQueries(
  cluster: string,
): Array<MetricQueryConfigData> {
  const specs: Array<MetricQuerySpec> = [
    {
      variable: "controller_queue_depth",
      title: "Work Queue Depth",
      description:
        "Current depth of controller work queues. High values mean controllers are falling behind.",
      legend: "Queue Depth",
      legendUnit: "",
      metricName: "workqueue_depth",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "controller_queue_latency",
      title: "Queue Latency",
      description:
        "Time items spend waiting in the work queue before processing.",
      legend: "Latency",
      legendUnit: "seconds",
      metricName: "workqueue_queue_duration_seconds_sum",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "controller_work_duration",
      title: "Work Duration",
      description: "Time spent processing a single work queue item.",
      legend: "Duration",
      legendUnit: "seconds",
      metricName: "workqueue_work_duration_seconds_sum",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "controller_retries",
      title: "Retries",
      description:
        "Number of work queue item retries. Frequent retries indicate reconciliation failures.",
      legend: "Retries",
      legendUnit: "",
      metricName: "workqueue_retries_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "controller_adds",
      title: "Queue Additions",
      description: "Rate of items being added to work queues.",
      legend: "Adds",
      legendUnit: "",
      metricName: "workqueue_adds_total",
      aggregation: AggregationType.Sum,
    },
  ];

  return specs.map((spec: MetricQuerySpec): MetricQueryConfigData => {
    return buildQuery(spec, cluster);
  });
}
// ──────────────────────────────────────────────────────────────────────────────
// Main component
// ──────────────────────────────────────────────────────────────────────────────
const KubernetesClusterControlPlane: FunctionComponent<
PageComponentProps
@@ -31,14 +461,28 @@ const KubernetesClusterControlPlane: FunctionComponent<
const [cluster, setCluster] = useState<KubernetesCluster | null>(null);
const [isLoading, setIsLoading] = useState<boolean>(true);
const [error, setError] = useState<string>("");
const [etcdMetricViewData, setEtcdMetricViewData] =
useState<MetricViewData | null>(null);
const [apiServerMetricViewData, setApiServerMetricViewData] =
useState<MetricViewData | null>(null);
const [schedulerMetricViewData, setSchedulerMetricViewData] =
useState<MetricViewData | null>(null);
const [controllerMetricViewData, setControllerMetricViewData] =
useState<MetricViewData | null>(null);
const [timeRange, setTimeRange] = useState<RangeStartAndEndDateTime>({
range: TimeRange.PAST_ONE_HOUR,
});
const [startAndEndDate, setStartAndEndDate] = useState<InBetween<Date>>(
RangeStartAndEndDateTimeUtil.getStartAndEndDate({
range: TimeRange.PAST_ONE_HOUR,
}),
);
const handleTimeRangeChange: (
newTimeRange: RangeStartAndEndDateTime,
) => void = useCallback(
(newTimeRange: RangeStartAndEndDateTime): void => {
setTimeRange(newTimeRange);
setStartAndEndDate(
RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange),
);
},
[],
);
const fetchCluster: PromiseVoidFunction = async (): Promise<void> => {
setIsLoading(true);
@@ -63,180 +507,6 @@ const KubernetesClusterControlPlane: FunctionComponent<
});
}, []);
useEffect(() => {
if (!cluster) {
return;
}
const clusterIdentifier: string = cluster.clusterIdentifier || "";
const endDate: Date = OneUptimeDate.getCurrentDate();
const startDate: Date = OneUptimeDate.addRemoveHours(endDate, -6);
const startAndEndDate: InBetween<Date> = new InBetween(startDate, endDate);
const etcdDbSizeQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "etcd_db_size",
title: "etcd Database Size",
description: "Total size of the etcd database",
legend: "DB Size",
legendUnit: "",
},
metricQueryData: {
filterData: {
metricName: "etcd_mvcc_db_total_size_in_bytes",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Avg,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
yAxisValueFormatter: KubernetesResourceUtils.formatBytesForChart,
};
const apiServerRequestRateQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "apiserver_requests",
title: "API Server Request Rate",
description: "Total API server requests by verb",
legend: "Requests",
legendUnit: "req/s",
},
metricQueryData: {
filterData: {
metricName: "apiserver_request_total",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Sum,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
const apiServerLatencyQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "apiserver_latency",
title: "API Server Request Latency",
description: "API server request duration",
legend: "Latency",
legendUnit: "seconds",
},
metricQueryData: {
filterData: {
metricName: "apiserver_request_duration_seconds",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Avg,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
const schedulerPendingQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "scheduler_pending",
title: "Scheduler Pending Pods",
description: "Number of pods pending scheduling",
legend: "Pending Pods",
legendUnit: "",
},
metricQueryData: {
filterData: {
metricName: "scheduler_pending_pods",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Avg,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
const schedulerLatencyQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "scheduler_latency",
title: "Scheduler Latency",
description: "End-to-end scheduling latency",
legend: "Latency",
legendUnit: "seconds",
},
metricQueryData: {
filterData: {
metricName: "scheduler_e2e_scheduling_duration_seconds",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Avg,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
const controllerQueueDepthQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "controller_queue",
title: "Controller Manager Queue Depth",
description: "Work queue depth for controller manager",
legend: "Queue Depth",
legendUnit: "",
},
metricQueryData: {
filterData: {
metricName: "workqueue_depth",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Avg,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
setEtcdMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [etcdDbSizeQuery],
formulaConfigs: [],
});
setApiServerMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [apiServerRequestRateQuery, apiServerLatencyQuery],
formulaConfigs: [],
});
setSchedulerMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [schedulerPendingQuery, schedulerLatencyQuery],
formulaConfigs: [],
});
setControllerMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [controllerQueueDepthQuery],
formulaConfigs: [],
});
}, [cluster]);
if (isLoading) {
return <PageLoader isVisible={true} />;
}
@@ -245,70 +515,95 @@ const KubernetesClusterControlPlane: FunctionComponent<
return <ErrorMessage message={error} />;
}
if (
!cluster ||
!etcdMetricViewData ||
!apiServerMetricViewData ||
!schedulerMetricViewData ||
!controllerMetricViewData
) {
if (!cluster) {
return <ErrorMessage message="Cluster not found." />;
}
const clusterIdentifier: string = cluster.clusterIdentifier || "";
// Build all metric view data sections
const etcdData: MetricViewData = buildMetricViewData(
getEtcdQueries(clusterIdentifier),
startAndEndDate,
);
const apiServerData: MetricViewData = buildMetricViewData(
getApiServerQueries(clusterIdentifier),
startAndEndDate,
);
const schedulerData: MetricViewData = buildMetricViewData(
getSchedulerQueries(clusterIdentifier),
startAndEndDate,
);
const controllerData: MetricViewData = buildMetricViewData(
getControllerManagerQueries(clusterIdentifier),
startAndEndDate,
);
return (
<Fragment>
<div className="mb-4 p-4 bg-blue-50 border border-blue-200 rounded-lg">
<p className="text-sm text-blue-700">
Control plane metrics require the <code>controlPlane.enabled</code>{" "}
flag to be set to <code>true</code> in the kubernetes-agent Helm chart
values. This is typically only available for self-managed Kubernetes
clusters, not managed services like EKS, GKE, or AKS.
</p>
{/* Info banner */}
<div className="mb-5 flex items-start gap-3 p-4 bg-blue-50 border border-blue-200 rounded-xl">
<div className="flex-shrink-0 mt-0.5">
<Icon
icon={IconProp.Info}
className="h-5 w-5 text-blue-500"
/>
</div>
<div>
<p className="text-sm font-medium text-blue-800">
Control Plane Metrics Configuration
</p>
<p className="mt-1 text-sm text-blue-600">
Control plane metrics require{" "}
<code className="px-1 py-0.5 bg-blue-100 rounded text-xs font-mono">
controlPlane.enabled: true
</code>{" "}
in the kubernetes-agent Helm chart values. This is typically only
available for self-managed clusters, not managed services like EKS,
GKE, or AKS.
</p>
</div>
</div>
<Card
{/* Global time range picker */}
<div className="mb-5 flex items-center justify-end">
<RangeStartAndEndDateView
dashboardStartAndEndDate={timeRange}
onChange={handleTimeRangeChange}
/>
</div>
{/* etcd */}
<ControlPlaneSection
title="etcd"
description="etcd is the consistent, distributed key-value store used as the backing store for all cluster data."
>
<MetricView
data={etcdMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
description="Distributed key-value store backing all cluster state. Monitors database size, disk I/O latency, leader stability, and replication health."
icon={IconProp.Database}
data={etcdData}
/>
<Card
{/* API Server */}
<ControlPlaneSection
title="API Server"
description="The Kubernetes API server validates and configures data for API objects and serves REST operations."
>
<MetricView
data={apiServerMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
description="Central management entity that validates and serves all REST operations. Tracks request throughput, latency, error rates, and connection health."
icon={IconProp.Globe}
data={apiServerData}
/>
<Card
{/* Scheduler */}
<ControlPlaneSection
title="Scheduler"
description="The scheduler watches for newly created pods that have no node assigned and selects a node for them to run on."
>
<MetricView
data={schedulerMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
description="Assigns pods to nodes based on resource requirements, affinity, and constraints. Monitors scheduling throughput, queue pressure, and preemption activity."
icon={IconProp.AdjustmentHorizontal}
data={schedulerData}
/>
<Card
{/* Controller Manager */}
<ControlPlaneSection
title="Controller Manager"
description="The controller manager runs controller processes that regulate the state of the cluster."
>
<MetricView
data={controllerMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
description="Runs core control loops that reconcile cluster state. Tracks work queue depth, processing latency, retries, and throughput across all controllers."
icon={IconProp.Settings}
data={controllerData}
/>
</Fragment>
);
};

View File

@@ -7,12 +7,14 @@ import MetricView from "../../../Components/Metrics/MetricView";
import MetricViewData from "Common/Types/Metrics/MetricViewData";
import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData";
import AggregationType from "Common/Types/BaseDatabase/AggregationType";
import OneUptimeDate from "Common/Types/Date";
import InBetween from "Common/Types/BaseDatabase/InBetween";
import IconProp from "Common/Types/Icon/IconProp";
import Icon from "Common/UI/Components/Icon/Icon";
import React, {
Fragment,
FunctionComponent,
ReactElement,
useCallback,
useEffect,
useState,
} from "react";
@@ -21,6 +23,536 @@ import API from "Common/UI/Utils/API/API";
import PageLoader from "Common/UI/Components/Loader/PageLoader";
import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage";
import { PromiseVoidFunction } from "Common/Types/FunctionTypes";
import RangeStartAndEndDateTime, {
RangeStartAndEndDateTimeUtil,
} from "Common/Types/Time/RangeStartAndEndDateTime";
import TimeRange from "Common/Types/Time/TimeRange";
import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView";
import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils";
// ──────────────────────────────────────────────────────────────────────────────
// Query builder helper
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Compact description of a single mesh metric chart. buildQuery expands one
 * of these into a full MetricQueryConfigData.
 */
interface MetricQuerySpec {
// Copied to metricAliasData.metricVariable.
variable: string;
// Copied to metricAliasData.title.
title: string;
// Copied to metricAliasData.description.
description: string;
// Copied to metricAliasData.legend.
legend: string;
// Copied to metricAliasData.legendUnit; an empty string is used where no unit applies.
legendUnit: string;
// Name of the metric placed in the query's filterData.metricName.
metricName: string;
// Aggregation placed in the query's filterData.aggegationType.
aggregation: AggregationType;
// Optional formatter passed through as yAxisValueFormatter.
yAxisFormatter?: (value: number) => string;
}
/**
 * Turns a metric spec into the query configuration for one chart.
 *
 * The query matches the spec's metric name, is limited to series whose
 * "resource.k8s.cluster.name" attribute equals the given cluster identifier,
 * and groups results by attributes.
 *
 * @param spec - Chart labels, metric name, aggregation and optional formatter.
 * @param clusterIdentifier - Value matched against "resource.k8s.cluster.name".
 * @returns The query configuration for one chart.
 */
function buildQuery(
  spec: MetricQuerySpec,
  clusterIdentifier: string,
): MetricQueryConfigData {
  // Display-only fields, copied from the spec.
  const aliasData: MetricQueryConfigData["metricAliasData"] = {
    metricVariable: spec.variable,
    title: spec.title,
    description: spec.description,
    legend: spec.legend,
    legendUnit: spec.legendUnit,
  };

  const result: MetricQueryConfigData = {
    metricAliasData: aliasData,
    metricQueryData: {
      filterData: {
        metricName: spec.metricName,
        attributes: {
          "resource.k8s.cluster.name": clusterIdentifier,
        },
        // "aggegationType" is the field name used by the query data type.
        aggegationType: spec.aggregation,
        aggregateBy: {},
      },
      groupBy: {
        attributes: true,
      },
    },
    // Stays undefined when the spec supplies no formatter.
    yAxisValueFormatter: spec.yAxisFormatter,
  };

  return result;
}
/**
 * Combines query configs and a time window into a MetricViewData with no
 * formulas (formulaConfigs is always empty).
 *
 * @param queries - Query configurations to display.
 * @param startAndEndDate - Time window shared by every query.
 * @returns The view data for one section.
 */
function buildMetricViewData(
  queries: Array<MetricQueryConfigData>,
  startAndEndDate: InBetween<Date>,
): MetricViewData {
  const metricViewData: MetricViewData = {
    startAndEndDate: startAndEndDate,
    queryConfigs: queries,
    formulaConfigs: [],
  };

  return metricViewData;
}
// ──────────────────────────────────────────────────────────────────────────────
// Section component
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Props for MeshSection.
 */
interface SectionProps {
// Text shown next to the icon in the card title.
title: string;
// Passed to the Card as its description.
description: string;
// Icon rendered before the title text.
icon: IconProp;
// Metric view data handed to MetricView inside the card.
data: MetricViewData;
}
/**
 * Renders one mesh section as a Card whose title is an icon plus text and
 * whose body is a MetricView showing the supplied data.
 *
 * The MetricView is rendered with query elements, its own start/end date
 * picker and per-chart cards hidden; its onChange handler is a no-op.
 */
const MeshSection: FunctionComponent<SectionProps> = (
  props: SectionProps,
): ReactElement => {
  const { title, description, icon, data } = props;

  // Card title: icon followed by the section name.
  const cardTitle: ReactElement = (
    <div className="flex items-center gap-2">
      <Icon icon={icon} className="h-5 w-5 text-gray-500" />
      <span>{title}</span>
    </div>
  );

  return (
    <Card title={cardTitle} description={description}>
      <MetricView
        data={data}
        hideQueryElements={true}
        hideStartAndEndDate={true}
        hideCardInCharts={true}
        onChange={() => {}}
      />
    </Card>
  );
};
// ──────────────────────────────────────────────────────────────────────────────
// Istio metric specs
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Builds the query configs for the Istio traffic section: request count,
 * request duration sum, request and response byte sums, TCP bytes sent and
 * received, and TCP connections opened and closed.
 *
 * @param cluster - Cluster identifier passed to buildQuery for every spec.
 * @returns One query config per spec, in the order listed below.
 */
function getIstioQueries(cluster: string): Array<MetricQueryConfigData> {
  const specs: Array<MetricQuerySpec> = [
    {
      variable: "istio_requests_total",
      title: "Request Rate",
      description:
        "Total request throughput across all Envoy sidecars in the mesh.",
      legend: "Requests",
      legendUnit: "req/s",
      metricName: "istio_requests_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "istio_request_duration",
      title: "Request Duration",
      description:
        "Average request latency through the mesh. High values indicate service or network slowness.",
      legend: "Duration",
      legendUnit: "ms",
      metricName: "istio_request_duration_milliseconds_sum",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "istio_request_bytes",
      title: "Request Size",
      description: "Size of HTTP request bodies flowing through the mesh.",
      legend: "Size",
      legendUnit: "",
      metricName: "istio_request_bytes_sum",
      aggregation: AggregationType.Sum,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    {
      variable: "istio_response_bytes",
      title: "Response Size",
      description: "Size of HTTP response bodies flowing through the mesh.",
      legend: "Size",
      legendUnit: "",
      metricName: "istio_response_bytes_sum",
      aggregation: AggregationType.Sum,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    {
      variable: "istio_tcp_sent_bytes",
      title: "TCP Bytes Sent",
      description:
        "Total bytes sent over TCP connections in the mesh. Includes all L4 traffic.",
      legend: "Sent",
      legendUnit: "",
      metricName: "istio_tcp_sent_bytes_total",
      aggregation: AggregationType.Sum,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    {
      variable: "istio_tcp_received_bytes",
      title: "TCP Bytes Received",
      description: "Total bytes received over TCP connections in the mesh.",
      legend: "Received",
      legendUnit: "",
      metricName: "istio_tcp_received_bytes_total",
      aggregation: AggregationType.Sum,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    {
      variable: "istio_tcp_connections_opened",
      title: "TCP Connections Opened",
      description:
        "Number of new TCP connections opened. Spikes may indicate connection churn.",
      legend: "Opened",
      legendUnit: "",
      metricName: "istio_tcp_connections_opened_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "istio_tcp_connections_closed",
      title: "TCP Connections Closed",
      description:
        "Number of TCP connections closed. Compare with opened to detect leaks.",
      legend: "Closed",
      legendUnit: "",
      metricName: "istio_tcp_connections_closed_total",
      aggregation: AggregationType.Sum,
    },
  ];

  return specs.map((spec: MetricQuerySpec): MetricQueryConfigData => {
    return buildQuery(spec, cluster);
  });
}
// ──────────────────────────────────────────────────────────────────────────────
// Istio Pilot / Control Plane metric specs
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Builds the query configs for the Istio Pilot (control plane) section: xDS
 * pushes, xDS internal errors, proxy convergence time sum, connected
 * proxies, and inbound and outbound listener conflicts.
 *
 * @param cluster - Cluster identifier passed to buildQuery for every spec.
 * @returns One query config per spec, in the order listed below.
 */
function getIstioPilotQueries(cluster: string): Array<MetricQueryConfigData> {
  const specs: Array<MetricQuerySpec> = [
    {
      variable: "pilot_xds_pushes",
      title: "xDS Config Pushes",
      description:
        "Number of xDS configuration pushes to Envoy sidecars. High rates mean frequent config changes.",
      legend: "Pushes",
      legendUnit: "",
      metricName: "pilot_xds_pushes",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "pilot_xds_push_errors",
      title: "xDS Push Errors",
      description:
        "Configuration push failures to sidecars. Non-zero values indicate connectivity or config issues.",
      legend: "Errors",
      legendUnit: "",
      metricName: "pilot_total_xds_internal_errors",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "pilot_proxy_convergence",
      title: "Proxy Convergence Time",
      description:
        "Time for configuration changes to propagate to all Envoy proxies.",
      legend: "Duration",
      legendUnit: "seconds",
      metricName: "pilot_proxy_convergence_time_sum",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "pilot_xds_connected",
      title: "Connected Proxies",
      description:
        "Number of Envoy proxies currently connected to Pilot (istiod).",
      legend: "Proxies",
      legendUnit: "",
      metricName: "pilot_xds",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "pilot_conflict_inbound",
      title: "Inbound Listener Conflicts",
      description:
        "Number of inbound listener conflicts. Indicates overlapping port configurations.",
      legend: "Conflicts",
      legendUnit: "",
      metricName: "pilot_conflict_inbound_listener",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "pilot_conflict_outbound",
      title: "Outbound Listener Conflicts",
      description: "Number of outbound listener conflicts between services.",
      legend: "Conflicts",
      legendUnit: "",
      metricName: "pilot_conflict_outbound_listener_tcp_over_current_tcp",
      aggregation: AggregationType.Sum,
    },
  ];

  return specs.map((spec: MetricQuerySpec): MetricQueryConfigData => {
    return buildQuery(spec, cluster);
  });
}
// ──────────────────────────────────────────────────────────────────────────────
// Envoy Proxy metric specs
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Builds the query configs for the Envoy proxy section: total and active
 * upstream connections, upstream request timeouts and retries, and upstream
 * connection failures.
 *
 * @param cluster - Cluster identifier passed to buildQuery for every spec.
 * @returns One query config per spec, in the order listed below.
 */
function getEnvoyQueries(cluster: string): Array<MetricQueryConfigData> {
  const specs: Array<MetricQuerySpec> = [
    {
      variable: "envoy_cluster_upstream_cx_total",
      title: "Upstream Connections",
      description:
        "Total upstream connections initiated by Envoy proxies to backend services.",
      legend: "Connections",
      legendUnit: "",
      metricName: "envoy_cluster_upstream_cx_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "envoy_cluster_upstream_cx_active",
      title: "Active Upstream Connections",
      description:
        "Currently active upstream connections. Shows real-time connection pool utilization.",
      legend: "Active",
      legendUnit: "",
      metricName: "envoy_cluster_upstream_cx_active",
      aggregation: AggregationType.Avg,
    },
    {
      variable: "envoy_cluster_upstream_rq_timeout",
      title: "Upstream Request Timeouts",
      description:
        "Requests that timed out waiting for an upstream response. Indicates slow or unresponsive services.",
      legend: "Timeouts",
      legendUnit: "",
      metricName: "envoy_cluster_upstream_rq_timeout",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "envoy_cluster_upstream_rq_retry",
      title: "Upstream Retries",
      description:
        "Number of request retries to upstream services. High retries indicate flaky backends.",
      legend: "Retries",
      legendUnit: "",
      metricName: "envoy_cluster_upstream_rq_retry",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "envoy_cluster_upstream_cx_connect_fail",
      title: "Connection Failures",
      description:
        "Failed upstream connection attempts. Indicates network or service availability issues.",
      legend: "Failures",
      legendUnit: "",
      metricName: "envoy_cluster_upstream_cx_connect_fail",
      aggregation: AggregationType.Sum,
    },
  ];

  return specs.map((spec: MetricQuerySpec): MetricQueryConfigData => {
    return buildQuery(spec, cluster);
  });
}
// ──────────────────────────────────────────────────────────────────────────────
// Linkerd metric specs
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Builds the metric queries for the Linkerd data-plane charts.
 *
 * The charts are declared as a table of specs; each spec is handed to
 * `buildQuery` together with `cluster`, and the results are returned in
 * declaration order.
 *
 * @param cluster - Cluster value forwarded unchanged to `buildQuery`.
 * @returns One query configuration per Linkerd proxy metric listed below.
 */
function getLinkerdQueries(cluster: string): Array<MetricQueryConfigData> {
  // Spec shape is whatever `buildQuery` accepts as its first argument.
  type LinkerdQuerySpec = Parameters<typeof buildQuery>[0];

  const linkerdSpecs: Array<LinkerdQuerySpec> = [
    // Request throughput.
    {
      variable: "linkerd_request_total",
      title: "Request Rate",
      description:
        "Total request throughput across all Linkerd proxy sidecars.",
      legend: "Requests",
      legendUnit: "req/s",
      metricName: "request_total",
      aggregation: AggregationType.Sum,
    },
    // Response latency, averaged.
    {
      variable: "linkerd_response_latency",
      title: "Response Latency",
      description:
        "End-to-end response latency measured at the Linkerd proxy.",
      legend: "Latency",
      legendUnit: "ms",
      metricName: "response_latency_ms_sum",
      aggregation: AggregationType.Avg,
    },
    // Currently open TCP connections, averaged.
    {
      variable: "linkerd_tcp_open_connections",
      title: "Open TCP Connections",
      description:
        "Number of currently open TCP connections managed by Linkerd proxies.",
      legend: "Connections",
      legendUnit: "",
      metricName: "tcp_open_connections",
      aggregation: AggregationType.Avg,
    },
    // Byte counters use the shared byte formatter for the Y axis.
    {
      variable: "linkerd_tcp_read_bytes",
      title: "TCP Read Bytes",
      description:
        "Total bytes read from TCP connections by Linkerd proxies.",
      legend: "Read",
      legendUnit: "",
      metricName: "tcp_read_bytes_total",
      aggregation: AggregationType.Sum,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    {
      variable: "linkerd_tcp_write_bytes",
      title: "TCP Write Bytes",
      description:
        "Total bytes written to TCP connections by Linkerd proxies.",
      legend: "Written",
      legendUnit: "",
      metricName: "tcp_write_bytes_total",
      aggregation: AggregationType.Sum,
      yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
    },
    // Connection open/close counters, meant to be read side by side.
    {
      variable: "linkerd_tcp_open_total",
      title: "TCP Connections Opened",
      description:
        "Total TCP connections opened over time by Linkerd proxies.",
      legend: "Opened",
      legendUnit: "",
      metricName: "tcp_open_total",
      aggregation: AggregationType.Sum,
    },
    {
      variable: "linkerd_tcp_close_total",
      title: "TCP Connections Closed",
      description:
        "Total TCP connections closed. Compare with opened to detect leaks.",
      legend: "Closed",
      legendUnit: "",
      metricName: "tcp_close_total",
      aggregation: AggregationType.Sum,
    },
  ];

  // Explicit callback so `map`'s index/array arguments never reach buildQuery.
  return linkerdSpecs.map(
    (spec: LinkerdQuerySpec): MetricQueryConfigData => {
      return buildQuery(spec, cluster);
    },
  );
}
// ──────────────────────────────────────────────────────────────────────────────
// Linkerd Control Plane metric specs
// ──────────────────────────────────────────────────────────────────────────────
/**
 * Builds the metric queries for the Linkerd control-plane charts
 * (identity, destination and proxy injector).
 *
 * The charts are declared as a table of specs; each spec is handed to
 * `buildQuery` together with `cluster`, and the results are returned in
 * declaration order.
 *
 * @param cluster - Cluster value forwarded unchanged to `buildQuery`.
 * @returns One query configuration per control-plane metric listed below.
 */
function getLinkerdControlPlaneQueries(
  cluster: string,
): Array<MetricQueryConfigData> {
  // Spec shape is whatever `buildQuery` accepts as its first argument.
  type ControlPlaneQuerySpec = Parameters<typeof buildQuery>[0];

  const controlPlaneSpecs: Array<ControlPlaneQuerySpec> = [
    // Identity service.
    {
      variable: "linkerd_identity_certs_issued",
      title: "mTLS Certificates Issued",
      description:
        "Number of mTLS identity certificates issued by the Linkerd identity service.",
      legend: "Certificates",
      legendUnit: "",
      metricName: "identity_cert_rotation_count",
      aggregation: AggregationType.Sum,
    },
    // Destination controller.
    {
      variable: "linkerd_destination_gets",
      title: "Destination Lookups",
      description:
        "Service discovery lookups to the destination controller. Shows mesh routing activity.",
      legend: "Lookups",
      legendUnit: "",
      metricName: "destination_get_total",
      aggregation: AggregationType.Sum,
    },
    // Proxy injector webhook.
    {
      variable: "linkerd_proxy_injector",
      title: "Proxy Injections",
      description:
        "Number of proxy sidecar injections performed by the webhook.",
      legend: "Injections",
      legendUnit: "",
      metricName: "proxy_injector_injection_total",
      aggregation: AggregationType.Sum,
    },
  ];

  // Explicit callback so `map`'s index/array arguments never reach buildQuery.
  return controlPlaneSpecs.map(
    (spec: ControlPlaneQuerySpec): MetricQueryConfigData => {
      return buildQuery(spec, cluster);
    },
  );
}
// ──────────────────────────────────────────────────────────────────────────────
// Main component
// ──────────────────────────────────────────────────────────────────────────────
const KubernetesClusterServiceMesh: FunctionComponent<
PageComponentProps
@@ -30,14 +562,28 @@ const KubernetesClusterServiceMesh: FunctionComponent<
const [cluster, setCluster] = useState<KubernetesCluster | null>(null);
const [isLoading, setIsLoading] = useState<boolean>(true);
const [error, setError] = useState<string>("");
const [istioRequestsMetricViewData, setIstioRequestsMetricViewData] =
useState<MetricViewData | null>(null);
const [istioLatencyMetricViewData, setIstioLatencyMetricViewData] =
useState<MetricViewData | null>(null);
const [linkerdRequestsMetricViewData, setLinkerdRequestsMetricViewData] =
useState<MetricViewData | null>(null);
const [linkerdLatencyMetricViewData, setLinkerdLatencyMetricViewData] =
useState<MetricViewData | null>(null);
const [timeRange, setTimeRange] = useState<RangeStartAndEndDateTime>({
range: TimeRange.PAST_ONE_HOUR,
});
const [startAndEndDate, setStartAndEndDate] = useState<InBetween<Date>>(
RangeStartAndEndDateTimeUtil.getStartAndEndDate({
range: TimeRange.PAST_ONE_HOUR,
}),
);
const handleTimeRangeChange: (
newTimeRange: RangeStartAndEndDateTime,
) => void = useCallback(
(newTimeRange: RangeStartAndEndDateTime): void => {
setTimeRange(newTimeRange);
setStartAndEndDate(
RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange),
);
},
[],
);
const fetchCluster: PromiseVoidFunction = async (): Promise<void> => {
setIsLoading(true);
@@ -62,137 +608,6 @@ const KubernetesClusterServiceMesh: FunctionComponent<
});
}, []);
useEffect(() => {
if (!cluster) {
return;
}
const clusterIdentifier: string = cluster.clusterIdentifier || "";
const endDate: Date = OneUptimeDate.getCurrentDate();
const startDate: Date = OneUptimeDate.addRemoveHours(endDate, -6);
const startAndEndDate: InBetween<Date> = new InBetween(startDate, endDate);
// Istio metrics
const istioRequestsTotalQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "istio_requests_total",
title: "Istio Request Rate",
description: "Total requests through the Istio service mesh",
legend: "Requests",
legendUnit: "req/s",
},
metricQueryData: {
filterData: {
metricName: "istio_requests_total",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Sum,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
const istioRequestDurationQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "istio_request_duration",
title: "Istio Request Latency",
description:
"Request duration through the Istio service mesh (p50/p99)",
legend: "Latency",
legendUnit: "ms",
},
metricQueryData: {
filterData: {
metricName: "istio_request_duration_milliseconds_bucket",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Avg,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
// Linkerd metrics
const linkerdRequestTotalQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "linkerd_request_total",
title: "Linkerd Request Rate",
description: "Total requests through the Linkerd service mesh",
legend: "Requests",
legendUnit: "req/s",
},
metricQueryData: {
filterData: {
metricName: "request_total",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Sum,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
const linkerdResponseLatencyQuery: MetricQueryConfigData = {
metricAliasData: {
metricVariable: "linkerd_response_latency",
title: "Linkerd Response Latency",
description:
"Response latency through the Linkerd service mesh (p50/p99)",
legend: "Latency",
legendUnit: "ms",
},
metricQueryData: {
filterData: {
metricName: "response_latency_ms_bucket",
attributes: {
"resource.k8s.cluster.name": clusterIdentifier,
},
aggegationType: AggregationType.Avg,
aggregateBy: {},
},
groupBy: {
attributes: true,
},
},
};
setIstioRequestsMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [istioRequestsTotalQuery],
formulaConfigs: [],
});
setIstioLatencyMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [istioRequestDurationQuery],
formulaConfigs: [],
});
setLinkerdRequestsMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [linkerdRequestTotalQuery],
formulaConfigs: [],
});
setLinkerdLatencyMetricViewData({
startAndEndDate: startAndEndDate,
queryConfigs: [linkerdResponseLatencyQuery],
formulaConfigs: [],
});
}, [cluster]);
if (isLoading) {
return <PageLoader isVisible={true} />;
}
@@ -201,71 +616,126 @@ const KubernetesClusterServiceMesh: FunctionComponent<
return <ErrorMessage message={error} />;
}
if (
!cluster ||
!istioRequestsMetricViewData ||
!istioLatencyMetricViewData ||
!linkerdRequestsMetricViewData ||
!linkerdLatencyMetricViewData
) {
if (!cluster) {
return <ErrorMessage message="Cluster not found." />;
}
const clusterIdentifier: string = cluster.clusterIdentifier || "";
// Build all metric view data
const istioData: MetricViewData = buildMetricViewData(
getIstioQueries(clusterIdentifier),
startAndEndDate,
);
const istioPilotData: MetricViewData = buildMetricViewData(
getIstioPilotQueries(clusterIdentifier),
startAndEndDate,
);
const envoyData: MetricViewData = buildMetricViewData(
getEnvoyQueries(clusterIdentifier),
startAndEndDate,
);
const linkerdData: MetricViewData = buildMetricViewData(
getLinkerdQueries(clusterIdentifier),
startAndEndDate,
);
const linkerdControlPlaneData: MetricViewData = buildMetricViewData(
getLinkerdControlPlaneQueries(clusterIdentifier),
startAndEndDate,
);
return (
<Fragment>
<div className="mb-4 p-4 bg-blue-50 border border-blue-200 rounded-lg">
<p className="text-sm text-blue-700">
Service mesh metrics require the <code>serviceMesh.enabled</code> flag
to be set to <code>true</code> and the{" "}
<code>serviceMesh.provider</code> to be configured in the
kubernetes-agent Helm chart values. Supported providers are Istio and
Linkerd.
{/* Info banner */}
<div className="mb-5 flex items-start gap-3 p-4 bg-blue-50 border border-blue-200 rounded-xl">
<div className="flex-shrink-0 mt-0.5">
<Icon icon={IconProp.Info} className="h-5 w-5 text-blue-500" />
</div>
<div>
<p className="text-sm font-medium text-blue-800">
Service Mesh Metrics Configuration
</p>
<p className="mt-1 text-sm text-blue-600">
Service mesh metrics require{" "}
<code className="px-1 py-0.5 bg-blue-100 rounded text-xs font-mono">
serviceMesh.enabled: true
</code>{" "}
and{" "}
<code className="px-1 py-0.5 bg-blue-100 rounded text-xs font-mono">
serviceMesh.provider
</code>{" "}
to be configured in the kubernetes-agent Helm chart values.
Supported providers are Istio and Linkerd. Only the sections
matching your provider will show data.
</p>
</div>
</div>
{/* Global time range picker */}
<div className="mb-5 flex items-center justify-end">
<RangeStartAndEndDateView
dashboardStartAndEndDate={timeRange}
onChange={handleTimeRangeChange}
/>
</div>
{/* ── Istio Sections ─────────────────────────────────────────────── */}
<div className="mb-2">
<h2 className="text-lg font-semibold text-gray-900">Istio</h2>
<p className="text-sm text-gray-500">
Metrics from Istio Envoy sidecars and Pilot (istiod) control plane.
</p>
</div>
<Card
title="Istio - Request Rate"
description="Total request rate through Istio envoy sidecars across all services in the mesh."
>
<MetricView
data={istioRequestsMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
{/* Istio Data Plane */}
<MeshSection
title="Data Plane — Traffic"
description="HTTP and TCP traffic flowing through Envoy sidecar proxies. Covers request throughput, latency, payload sizes, and connection lifecycle."
icon={IconProp.ArrowCircleRight}
data={istioData}
/>
<Card
title="Istio - Request Latency"
description="Request duration distribution through the Istio service mesh."
>
<MetricView
data={istioLatencyMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
{/* Istio Control Plane (Pilot / istiod) */}
<MeshSection
title="Control Plane — Pilot (istiod)"
description="Istio Pilot manages xDS configuration distribution to all Envoy proxies. Monitors push throughput, errors, convergence time, and listener conflicts."
icon={IconProp.Settings}
data={istioPilotData}
/>
<Card
title="Linkerd - Request Rate"
description="Total request rate through Linkerd proxy sidecars across all services in the mesh."
>
<MetricView
data={linkerdRequestsMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
{/* Envoy Proxy internals */}
<MeshSection
title="Envoy Proxy"
description="Low-level Envoy sidecar proxy metrics. Tracks upstream connection pools, request timeouts, retries, and connection failures."
icon={IconProp.Globe}
data={envoyData}
/>
<Card
title="Linkerd - Response Latency"
description="Response latency distribution through the Linkerd service mesh."
>
<MetricView
data={linkerdLatencyMetricViewData}
hideQueryElements={true}
onChange={() => {}}
/>
</Card>
{/* ── Linkerd Sections ───────────────────────────────────────────── */}
<div className="mb-2 mt-6">
<h2 className="text-lg font-semibold text-gray-900">Linkerd</h2>
<p className="text-sm text-gray-500">
Metrics from Linkerd proxy sidecars and control plane components.
</p>
</div>
{/* Linkerd Data Plane */}
<MeshSection
title="Data Plane — Traffic"
description="Request throughput, response latency, and TCP connection metrics from Linkerd proxy sidecars."
icon={IconProp.ArrowCircleRight}
data={linkerdData}
/>
{/* Linkerd Control Plane */}
<MeshSection
title="Control Plane"
description="Linkerd control plane components: identity (mTLS certificate issuance), destination (service discovery), and proxy injector (sidecar injection)."
icon={IconProp.Settings}
data={linkerdControlPlaneData}
/>
</Fragment>
);
};