mirror of
https://github.com/OneUptime/oneuptime.git
synced 2026-04-06 00:32:12 +02:00
feat: enhance Kubernetes service mesh metrics view with Istio and Linkerd support
- Introduced new metric query specifications for Istio and Linkerd. - Refactored metric data fetching to utilize a unified query builder. - Added a global time range picker for metrics visualization. - Created reusable MeshSection component for displaying metrics. - Updated UI to present Istio and Linkerd metrics in organized sections.
This commit is contained in:
@@ -7,12 +7,13 @@ import MetricView from "../../../Components/Metrics/MetricView";
|
||||
import MetricViewData from "Common/Types/Metrics/MetricViewData";
|
||||
import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData";
|
||||
import AggregationType from "Common/Types/BaseDatabase/AggregationType";
|
||||
import OneUptimeDate from "Common/Types/Date";
|
||||
import InBetween from "Common/Types/BaseDatabase/InBetween";
|
||||
import IconProp from "Common/Types/Icon/IconProp";
|
||||
import Icon from "Common/UI/Components/Icon/Icon";
|
||||
import React, {
|
||||
Fragment,
|
||||
FunctionComponent,
|
||||
ReactElement,
|
||||
useCallback,
|
||||
useEffect,
|
||||
useState,
|
||||
} from "react";
|
||||
@@ -22,6 +23,435 @@ import PageLoader from "Common/UI/Components/Loader/PageLoader";
|
||||
import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage";
|
||||
import { PromiseVoidFunction } from "Common/Types/FunctionTypes";
|
||||
import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils";
|
||||
import RangeStartAndEndDateTime, {
|
||||
RangeStartAndEndDateTimeUtil,
|
||||
} from "Common/Types/Time/RangeStartAndEndDateTime";
|
||||
import TimeRange from "Common/Types/Time/TimeRange";
|
||||
import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView";
|
||||
import InBetween from "Common/Types/BaseDatabase/InBetween";
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Query builder helper
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
interface MetricQuerySpec {
|
||||
variable: string;
|
||||
title: string;
|
||||
description: string;
|
||||
legend: string;
|
||||
legendUnit: string;
|
||||
metricName: string;
|
||||
aggregation: AggregationType;
|
||||
yAxisFormatter?: (value: number) => string;
|
||||
}
|
||||
|
||||
function buildQuery(
|
||||
spec: MetricQuerySpec,
|
||||
clusterIdentifier: string,
|
||||
): MetricQueryConfigData {
|
||||
return {
|
||||
metricAliasData: {
|
||||
metricVariable: spec.variable,
|
||||
title: spec.title,
|
||||
description: spec.description,
|
||||
legend: spec.legend,
|
||||
legendUnit: spec.legendUnit,
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: spec.metricName,
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: spec.aggregation,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
yAxisValueFormatter: spec.yAxisFormatter,
|
||||
};
|
||||
}
|
||||
|
||||
function buildMetricViewData(
|
||||
queries: Array<MetricQueryConfigData>,
|
||||
startAndEndDate: InBetween<Date>,
|
||||
): MetricViewData {
|
||||
return {
|
||||
startAndEndDate,
|
||||
queryConfigs: queries,
|
||||
formulaConfigs: [],
|
||||
};
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Section component — one Card per control plane component
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
interface SectionProps {
|
||||
title: string;
|
||||
description: string;
|
||||
icon: IconProp;
|
||||
data: MetricViewData;
|
||||
}
|
||||
|
||||
const ControlPlaneSection: FunctionComponent<SectionProps> = (
|
||||
props: SectionProps,
|
||||
): ReactElement => {
|
||||
return (
|
||||
<Card
|
||||
title={
|
||||
<div className="flex items-center gap-2">
|
||||
<Icon
|
||||
icon={props.icon}
|
||||
className="h-5 w-5 text-gray-500"
|
||||
/>
|
||||
<span>{props.title}</span>
|
||||
</div>
|
||||
}
|
||||
description={props.description}
|
||||
>
|
||||
<MetricView
|
||||
data={props.data}
|
||||
hideQueryElements={true}
|
||||
hideStartAndEndDate={true}
|
||||
hideCardInCharts={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
);
|
||||
};
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Metric specs per control plane component
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function getEtcdQueries(cluster: string): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_db_size",
|
||||
title: "Database Size",
|
||||
description: "Total size of the etcd MVCC database on disk.",
|
||||
legend: "DB Size",
|
||||
legendUnit: "",
|
||||
metricName: "etcd_mvcc_db_total_size_in_bytes",
|
||||
aggregation: AggregationType.Avg,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_db_size_in_use",
|
||||
title: "Database Size In Use",
|
||||
description: "Actual used size of the etcd database (after compaction).",
|
||||
legend: "In Use",
|
||||
legendUnit: "",
|
||||
metricName: "etcd_mvcc_db_total_size_in_use_in_bytes",
|
||||
aggregation: AggregationType.Avg,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_keys_total",
|
||||
title: "Total Keys",
|
||||
description: "Total number of keys stored in etcd.",
|
||||
legend: "Keys",
|
||||
legendUnit: "",
|
||||
metricName: "etcd_debugging_mvcc_keys_total",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_leader_changes",
|
||||
title: "Leader Changes",
|
||||
description:
|
||||
"Number of leader changes seen. Frequent changes indicate instability.",
|
||||
legend: "Changes",
|
||||
legendUnit: "",
|
||||
metricName: "etcd_server_leader_changes_seen_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_proposals_failed",
|
||||
title: "Failed Proposals",
|
||||
description: "Total number of failed raft proposals.",
|
||||
legend: "Failed",
|
||||
legendUnit: "",
|
||||
metricName: "etcd_server_proposals_failed_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_disk_wal_fsync",
|
||||
title: "WAL Fsync Duration",
|
||||
description:
|
||||
"Latency of WAL fsync operations. High values cause slow commits.",
|
||||
legend: "Duration",
|
||||
legendUnit: "seconds",
|
||||
metricName: "etcd_disk_wal_fsync_duration_seconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_disk_backend_commit",
|
||||
title: "Backend Commit Duration",
|
||||
description: "Latency of backend commit operations.",
|
||||
legend: "Duration",
|
||||
legendUnit: "seconds",
|
||||
metricName: "etcd_disk_backend_commit_duration_seconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "etcd_network_peer_sent",
|
||||
title: "Network Peer Sent Bytes",
|
||||
description: "Total bytes sent to peers. Shows replication traffic.",
|
||||
legend: "Sent",
|
||||
legendUnit: "",
|
||||
metricName: "etcd_network_peer_sent_bytes_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
function getApiServerQueries(cluster: string): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "apiserver_requests",
|
||||
title: "Request Rate",
|
||||
description: "Total API server requests.",
|
||||
legend: "Requests",
|
||||
legendUnit: "req/s",
|
||||
metricName: "apiserver_request_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "apiserver_latency",
|
||||
title: "Request Latency",
|
||||
description: "Average API server request duration.",
|
||||
legend: "Latency",
|
||||
legendUnit: "seconds",
|
||||
metricName: "apiserver_request_duration_seconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "apiserver_errors",
|
||||
title: "Request Errors (5xx)",
|
||||
description:
|
||||
"API server 5xx error count. Non-zero values need investigation.",
|
||||
legend: "Errors",
|
||||
legendUnit: "",
|
||||
metricName: "apiserver_request_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "apiserver_inflight_requests",
|
||||
title: "In-Flight Requests",
|
||||
description:
|
||||
"Current number of in-flight requests being processed. High counts indicate API server saturation.",
|
||||
legend: "Requests",
|
||||
legendUnit: "",
|
||||
metricName: "apiserver_current_inflight_requests",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "apiserver_audit_events",
|
||||
title: "Audit Events",
|
||||
description:
|
||||
"Number of audit events processed. Monitors audit pipeline throughput.",
|
||||
legend: "Events",
|
||||
legendUnit: "",
|
||||
metricName: "apiserver_audit_event_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "apiserver_tls_handshake_errors",
|
||||
title: "TLS Handshake Errors",
|
||||
description:
|
||||
"Number of TLS handshake errors. Indicates certificate or connectivity issues.",
|
||||
legend: "Errors",
|
||||
legendUnit: "",
|
||||
metricName: "apiserver_tls_handshake_errors_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
function getSchedulerQueries(cluster: string): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "scheduler_pending",
|
||||
title: "Pending Pods",
|
||||
description:
|
||||
"Number of pods waiting to be scheduled. Sustained values indicate scheduling pressure.",
|
||||
legend: "Pending Pods",
|
||||
legendUnit: "",
|
||||
metricName: "scheduler_pending_pods",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "scheduler_latency",
|
||||
title: "Scheduling Latency",
|
||||
description: "End-to-end scheduling duration from pod creation to binding.",
|
||||
legend: "Latency",
|
||||
legendUnit: "seconds",
|
||||
metricName: "scheduler_e2e_scheduling_duration_seconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "scheduler_attempts",
|
||||
title: "Scheduling Attempts",
|
||||
description: "Number of scheduling attempts by result (scheduled, unschedulable, error).",
|
||||
legend: "Attempts",
|
||||
legendUnit: "",
|
||||
metricName: "scheduler_schedule_attempts_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "scheduler_preemptions",
|
||||
title: "Preemption Attempts",
|
||||
description: "Number of preemption attempts to free resources for higher-priority pods.",
|
||||
legend: "Preemptions",
|
||||
legendUnit: "",
|
||||
metricName: "scheduler_preemption_attempts_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "scheduler_queue_duration",
|
||||
title: "Queue Wait Duration",
|
||||
description: "Time pods spend in the scheduling queue before being scheduled.",
|
||||
legend: "Duration",
|
||||
legendUnit: "seconds",
|
||||
metricName: "scheduler_scheduling_attempt_duration_seconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
function getControllerManagerQueries(
|
||||
cluster: string,
|
||||
): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "controller_queue_depth",
|
||||
title: "Work Queue Depth",
|
||||
description:
|
||||
"Current depth of controller work queues. High values mean controllers are falling behind.",
|
||||
legend: "Queue Depth",
|
||||
legendUnit: "",
|
||||
metricName: "workqueue_depth",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "controller_queue_latency",
|
||||
title: "Queue Latency",
|
||||
description: "Time items spend waiting in the work queue before processing.",
|
||||
legend: "Latency",
|
||||
legendUnit: "seconds",
|
||||
metricName: "workqueue_queue_duration_seconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "controller_work_duration",
|
||||
title: "Work Duration",
|
||||
description: "Time spent processing a single work queue item.",
|
||||
legend: "Duration",
|
||||
legendUnit: "seconds",
|
||||
metricName: "workqueue_work_duration_seconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "controller_retries",
|
||||
title: "Retries",
|
||||
description:
|
||||
"Number of work queue item retries. Frequent retries indicate reconciliation failures.",
|
||||
legend: "Retries",
|
||||
legendUnit: "",
|
||||
metricName: "workqueue_retries_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "controller_adds",
|
||||
title: "Queue Additions",
|
||||
description: "Rate of items being added to work queues.",
|
||||
legend: "Adds",
|
||||
legendUnit: "",
|
||||
metricName: "workqueue_adds_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Main component
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
const KubernetesClusterControlPlane: FunctionComponent<
|
||||
PageComponentProps
|
||||
@@ -31,14 +461,28 @@ const KubernetesClusterControlPlane: FunctionComponent<
|
||||
const [cluster, setCluster] = useState<KubernetesCluster | null>(null);
|
||||
const [isLoading, setIsLoading] = useState<boolean>(true);
|
||||
const [error, setError] = useState<string>("");
|
||||
const [etcdMetricViewData, setEtcdMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
const [apiServerMetricViewData, setApiServerMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
const [schedulerMetricViewData, setSchedulerMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
const [controllerMetricViewData, setControllerMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
|
||||
const [timeRange, setTimeRange] = useState<RangeStartAndEndDateTime>({
|
||||
range: TimeRange.PAST_ONE_HOUR,
|
||||
});
|
||||
|
||||
const [startAndEndDate, setStartAndEndDate] = useState<InBetween<Date>>(
|
||||
RangeStartAndEndDateTimeUtil.getStartAndEndDate({
|
||||
range: TimeRange.PAST_ONE_HOUR,
|
||||
}),
|
||||
);
|
||||
|
||||
const handleTimeRangeChange: (
|
||||
newTimeRange: RangeStartAndEndDateTime,
|
||||
) => void = useCallback(
|
||||
(newTimeRange: RangeStartAndEndDateTime): void => {
|
||||
setTimeRange(newTimeRange);
|
||||
setStartAndEndDate(
|
||||
RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange),
|
||||
);
|
||||
},
|
||||
[],
|
||||
);
|
||||
|
||||
const fetchCluster: PromiseVoidFunction = async (): Promise<void> => {
|
||||
setIsLoading(true);
|
||||
@@ -63,180 +507,6 @@ const KubernetesClusterControlPlane: FunctionComponent<
|
||||
});
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
if (!cluster) {
|
||||
return;
|
||||
}
|
||||
|
||||
const clusterIdentifier: string = cluster.clusterIdentifier || "";
|
||||
const endDate: Date = OneUptimeDate.getCurrentDate();
|
||||
const startDate: Date = OneUptimeDate.addRemoveHours(endDate, -6);
|
||||
const startAndEndDate: InBetween<Date> = new InBetween(startDate, endDate);
|
||||
|
||||
const etcdDbSizeQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "etcd_db_size",
|
||||
title: "etcd Database Size",
|
||||
description: "Total size of the etcd database",
|
||||
legend: "DB Size",
|
||||
legendUnit: "",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "etcd_mvcc_db_total_size_in_bytes",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Avg,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
yAxisValueFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
};
|
||||
|
||||
const apiServerRequestRateQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "apiserver_requests",
|
||||
title: "API Server Request Rate",
|
||||
description: "Total API server requests by verb",
|
||||
legend: "Requests",
|
||||
legendUnit: "req/s",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "apiserver_request_total",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Sum,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const apiServerLatencyQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "apiserver_latency",
|
||||
title: "API Server Request Latency",
|
||||
description: "API server request duration",
|
||||
legend: "Latency",
|
||||
legendUnit: "seconds",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "apiserver_request_duration_seconds",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Avg,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const schedulerPendingQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "scheduler_pending",
|
||||
title: "Scheduler Pending Pods",
|
||||
description: "Number of pods pending scheduling",
|
||||
legend: "Pending Pods",
|
||||
legendUnit: "",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "scheduler_pending_pods",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Avg,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const schedulerLatencyQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "scheduler_latency",
|
||||
title: "Scheduler Latency",
|
||||
description: "End-to-end scheduling latency",
|
||||
legend: "Latency",
|
||||
legendUnit: "seconds",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "scheduler_e2e_scheduling_duration_seconds",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Avg,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const controllerQueueDepthQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "controller_queue",
|
||||
title: "Controller Manager Queue Depth",
|
||||
description: "Work queue depth for controller manager",
|
||||
legend: "Queue Depth",
|
||||
legendUnit: "",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "workqueue_depth",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Avg,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
setEtcdMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [etcdDbSizeQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
|
||||
setApiServerMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [apiServerRequestRateQuery, apiServerLatencyQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
|
||||
setSchedulerMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [schedulerPendingQuery, schedulerLatencyQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
|
||||
setControllerMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [controllerQueueDepthQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
}, [cluster]);
|
||||
|
||||
if (isLoading) {
|
||||
return <PageLoader isVisible={true} />;
|
||||
}
|
||||
@@ -245,70 +515,95 @@ const KubernetesClusterControlPlane: FunctionComponent<
|
||||
return <ErrorMessage message={error} />;
|
||||
}
|
||||
|
||||
if (
|
||||
!cluster ||
|
||||
!etcdMetricViewData ||
|
||||
!apiServerMetricViewData ||
|
||||
!schedulerMetricViewData ||
|
||||
!controllerMetricViewData
|
||||
) {
|
||||
if (!cluster) {
|
||||
return <ErrorMessage message="Cluster not found." />;
|
||||
}
|
||||
|
||||
const clusterIdentifier: string = cluster.clusterIdentifier || "";
|
||||
|
||||
// Build all metric view data sections
|
||||
const etcdData: MetricViewData = buildMetricViewData(
|
||||
getEtcdQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
const apiServerData: MetricViewData = buildMetricViewData(
|
||||
getApiServerQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
const schedulerData: MetricViewData = buildMetricViewData(
|
||||
getSchedulerQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
const controllerData: MetricViewData = buildMetricViewData(
|
||||
getControllerManagerQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
|
||||
return (
|
||||
<Fragment>
|
||||
<div className="mb-4 p-4 bg-blue-50 border border-blue-200 rounded-lg">
|
||||
<p className="text-sm text-blue-700">
|
||||
Control plane metrics require the <code>controlPlane.enabled</code>{" "}
|
||||
flag to be set to <code>true</code> in the kubernetes-agent Helm chart
|
||||
values. This is typically only available for self-managed Kubernetes
|
||||
clusters, not managed services like EKS, GKE, or AKS.
|
||||
</p>
|
||||
{/* Info banner */}
|
||||
<div className="mb-5 flex items-start gap-3 p-4 bg-blue-50 border border-blue-200 rounded-xl">
|
||||
<div className="flex-shrink-0 mt-0.5">
|
||||
<Icon
|
||||
icon={IconProp.Info}
|
||||
className="h-5 w-5 text-blue-500"
|
||||
/>
|
||||
</div>
|
||||
<div>
|
||||
<p className="text-sm font-medium text-blue-800">
|
||||
Control Plane Metrics Configuration
|
||||
</p>
|
||||
<p className="mt-1 text-sm text-blue-600">
|
||||
Control plane metrics require{" "}
|
||||
<code className="px-1 py-0.5 bg-blue-100 rounded text-xs font-mono">
|
||||
controlPlane.enabled: true
|
||||
</code>{" "}
|
||||
in the kubernetes-agent Helm chart values. This is typically only
|
||||
available for self-managed clusters, not managed services like EKS,
|
||||
GKE, or AKS.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<Card
|
||||
{/* Global time range picker */}
|
||||
<div className="mb-5 flex items-center justify-end">
|
||||
<RangeStartAndEndDateView
|
||||
dashboardStartAndEndDate={timeRange}
|
||||
onChange={handleTimeRangeChange}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* etcd */}
|
||||
<ControlPlaneSection
|
||||
title="etcd"
|
||||
description="etcd is the consistent, distributed key-value store used as the backing store for all cluster data."
|
||||
>
|
||||
<MetricView
|
||||
data={etcdMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
description="Distributed key-value store backing all cluster state. Monitors database size, disk I/O latency, leader stability, and replication health."
|
||||
icon={IconProp.Database}
|
||||
data={etcdData}
|
||||
/>
|
||||
|
||||
<Card
|
||||
{/* API Server */}
|
||||
<ControlPlaneSection
|
||||
title="API Server"
|
||||
description="The Kubernetes API server validates and configures data for API objects and serves REST operations."
|
||||
>
|
||||
<MetricView
|
||||
data={apiServerMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
description="Central management entity that validates and serves all REST operations. Tracks request throughput, latency, error rates, and connection health."
|
||||
icon={IconProp.Globe}
|
||||
data={apiServerData}
|
||||
/>
|
||||
|
||||
<Card
|
||||
{/* Scheduler */}
|
||||
<ControlPlaneSection
|
||||
title="Scheduler"
|
||||
description="The scheduler watches for newly created pods that have no node assigned and selects a node for them to run on."
|
||||
>
|
||||
<MetricView
|
||||
data={schedulerMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
description="Assigns pods to nodes based on resource requirements, affinity, and constraints. Monitors scheduling throughput, queue pressure, and preemption activity."
|
||||
icon={IconProp.AdjustmentHorizontal}
|
||||
data={schedulerData}
|
||||
/>
|
||||
|
||||
<Card
|
||||
{/* Controller Manager */}
|
||||
<ControlPlaneSection
|
||||
title="Controller Manager"
|
||||
description="The controller manager runs controller processes that regulate the state of the cluster."
|
||||
>
|
||||
<MetricView
|
||||
data={controllerMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
description="Runs core control loops that reconcile cluster state. Tracks work queue depth, processing latency, retries, and throughput across all controllers."
|
||||
icon={IconProp.Settings}
|
||||
data={controllerData}
|
||||
/>
|
||||
</Fragment>
|
||||
);
|
||||
};
|
||||
|
||||
@@ -7,12 +7,14 @@ import MetricView from "../../../Components/Metrics/MetricView";
|
||||
import MetricViewData from "Common/Types/Metrics/MetricViewData";
|
||||
import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData";
|
||||
import AggregationType from "Common/Types/BaseDatabase/AggregationType";
|
||||
import OneUptimeDate from "Common/Types/Date";
|
||||
import InBetween from "Common/Types/BaseDatabase/InBetween";
|
||||
import IconProp from "Common/Types/Icon/IconProp";
|
||||
import Icon from "Common/UI/Components/Icon/Icon";
|
||||
import React, {
|
||||
Fragment,
|
||||
FunctionComponent,
|
||||
ReactElement,
|
||||
useCallback,
|
||||
useEffect,
|
||||
useState,
|
||||
} from "react";
|
||||
@@ -21,6 +23,536 @@ import API from "Common/UI/Utils/API/API";
|
||||
import PageLoader from "Common/UI/Components/Loader/PageLoader";
|
||||
import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage";
|
||||
import { PromiseVoidFunction } from "Common/Types/FunctionTypes";
|
||||
import RangeStartAndEndDateTime, {
|
||||
RangeStartAndEndDateTimeUtil,
|
||||
} from "Common/Types/Time/RangeStartAndEndDateTime";
|
||||
import TimeRange from "Common/Types/Time/TimeRange";
|
||||
import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView";
|
||||
import KubernetesResourceUtils from "../Utils/KubernetesResourceUtils";
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Query builder helper
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
interface MetricQuerySpec {
|
||||
variable: string;
|
||||
title: string;
|
||||
description: string;
|
||||
legend: string;
|
||||
legendUnit: string;
|
||||
metricName: string;
|
||||
aggregation: AggregationType;
|
||||
yAxisFormatter?: (value: number) => string;
|
||||
}
|
||||
|
||||
function buildQuery(
|
||||
spec: MetricQuerySpec,
|
||||
clusterIdentifier: string,
|
||||
): MetricQueryConfigData {
|
||||
return {
|
||||
metricAliasData: {
|
||||
metricVariable: spec.variable,
|
||||
title: spec.title,
|
||||
description: spec.description,
|
||||
legend: spec.legend,
|
||||
legendUnit: spec.legendUnit,
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: spec.metricName,
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: spec.aggregation,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
yAxisValueFormatter: spec.yAxisFormatter,
|
||||
};
|
||||
}
|
||||
|
||||
function buildMetricViewData(
|
||||
queries: Array<MetricQueryConfigData>,
|
||||
startAndEndDate: InBetween<Date>,
|
||||
): MetricViewData {
|
||||
return {
|
||||
startAndEndDate,
|
||||
queryConfigs: queries,
|
||||
formulaConfigs: [],
|
||||
};
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Section component
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
interface SectionProps {
|
||||
title: string;
|
||||
description: string;
|
||||
icon: IconProp;
|
||||
data: MetricViewData;
|
||||
}
|
||||
|
||||
const MeshSection: FunctionComponent<SectionProps> = (
|
||||
props: SectionProps,
|
||||
): ReactElement => {
|
||||
return (
|
||||
<Card
|
||||
title={
|
||||
<div className="flex items-center gap-2">
|
||||
<Icon icon={props.icon} className="h-5 w-5 text-gray-500" />
|
||||
<span>{props.title}</span>
|
||||
</div>
|
||||
}
|
||||
description={props.description}
|
||||
>
|
||||
<MetricView
|
||||
data={props.data}
|
||||
hideQueryElements={true}
|
||||
hideStartAndEndDate={true}
|
||||
hideCardInCharts={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
);
|
||||
};
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Istio metric specs
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function getIstioQueries(cluster: string): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_requests_total",
|
||||
title: "Request Rate",
|
||||
description:
|
||||
"Total request throughput across all Envoy sidecars in the mesh.",
|
||||
legend: "Requests",
|
||||
legendUnit: "req/s",
|
||||
metricName: "istio_requests_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_request_duration",
|
||||
title: "Request Duration",
|
||||
description:
|
||||
"Average request latency through the mesh. High values indicate service or network slowness.",
|
||||
legend: "Duration",
|
||||
legendUnit: "ms",
|
||||
metricName: "istio_request_duration_milliseconds_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_request_bytes",
|
||||
title: "Request Size",
|
||||
description:
|
||||
"Size of HTTP request bodies flowing through the mesh.",
|
||||
legend: "Size",
|
||||
legendUnit: "",
|
||||
metricName: "istio_request_bytes_sum",
|
||||
aggregation: AggregationType.Sum,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_response_bytes",
|
||||
title: "Response Size",
|
||||
description:
|
||||
"Size of HTTP response bodies flowing through the mesh.",
|
||||
legend: "Size",
|
||||
legendUnit: "",
|
||||
metricName: "istio_response_bytes_sum",
|
||||
aggregation: AggregationType.Sum,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_tcp_sent_bytes",
|
||||
title: "TCP Bytes Sent",
|
||||
description:
|
||||
"Total bytes sent over TCP connections in the mesh. Includes all L4 traffic.",
|
||||
legend: "Sent",
|
||||
legendUnit: "",
|
||||
metricName: "istio_tcp_sent_bytes_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_tcp_received_bytes",
|
||||
title: "TCP Bytes Received",
|
||||
description:
|
||||
"Total bytes received over TCP connections in the mesh.",
|
||||
legend: "Received",
|
||||
legendUnit: "",
|
||||
metricName: "istio_tcp_received_bytes_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_tcp_connections_opened",
|
||||
title: "TCP Connections Opened",
|
||||
description:
|
||||
"Number of new TCP connections opened. Spikes may indicate connection churn.",
|
||||
legend: "Opened",
|
||||
legendUnit: "",
|
||||
metricName: "istio_tcp_connections_opened_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "istio_tcp_connections_closed",
|
||||
title: "TCP Connections Closed",
|
||||
description:
|
||||
"Number of TCP connections closed. Compare with opened to detect leaks.",
|
||||
legend: "Closed",
|
||||
legendUnit: "",
|
||||
metricName: "istio_tcp_connections_closed_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Istio Pilot / Control Plane metric specs
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function getIstioPilotQueries(cluster: string): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "pilot_xds_pushes",
|
||||
title: "xDS Config Pushes",
|
||||
description:
|
||||
"Number of xDS configuration pushes to Envoy sidecars. High rates mean frequent config changes.",
|
||||
legend: "Pushes",
|
||||
legendUnit: "",
|
||||
metricName: "pilot_xds_pushes",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "pilot_xds_push_errors",
|
||||
title: "xDS Push Errors",
|
||||
description:
|
||||
"Configuration push failures to sidecars. Non-zero values indicate connectivity or config issues.",
|
||||
legend: "Errors",
|
||||
legendUnit: "",
|
||||
metricName: "pilot_total_xds_internal_errors",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "pilot_proxy_convergence",
|
||||
title: "Proxy Convergence Time",
|
||||
description:
|
||||
"Time for configuration changes to propagate to all Envoy proxies.",
|
||||
legend: "Duration",
|
||||
legendUnit: "seconds",
|
||||
metricName: "pilot_proxy_convergence_time_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "pilot_xds_connected",
|
||||
title: "Connected Proxies",
|
||||
description:
|
||||
"Number of Envoy proxies currently connected to Pilot (istiod).",
|
||||
legend: "Proxies",
|
||||
legendUnit: "",
|
||||
metricName: "pilot_xds",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "pilot_conflict_inbound",
|
||||
title: "Inbound Listener Conflicts",
|
||||
description:
|
||||
"Number of inbound listener conflicts. Indicates overlapping port configurations.",
|
||||
legend: "Conflicts",
|
||||
legendUnit: "",
|
||||
metricName: "pilot_conflict_inbound_listener",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "pilot_conflict_outbound",
|
||||
title: "Outbound Listener Conflicts",
|
||||
description:
|
||||
"Number of outbound listener conflicts between services.",
|
||||
legend: "Conflicts",
|
||||
legendUnit: "",
|
||||
metricName: "pilot_conflict_outbound_listener_tcp_over_current_tcp",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Envoy Proxy metric specs
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function getEnvoyQueries(cluster: string): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "envoy_cluster_upstream_cx_total",
|
||||
title: "Upstream Connections",
|
||||
description:
|
||||
"Total upstream connections initiated by Envoy proxies to backend services.",
|
||||
legend: "Connections",
|
||||
legendUnit: "",
|
||||
metricName: "envoy_cluster_upstream_cx_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "envoy_cluster_upstream_cx_active",
|
||||
title: "Active Upstream Connections",
|
||||
description:
|
||||
"Currently active upstream connections. Shows real-time connection pool utilization.",
|
||||
legend: "Active",
|
||||
legendUnit: "",
|
||||
metricName: "envoy_cluster_upstream_cx_active",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "envoy_cluster_upstream_rq_timeout",
|
||||
title: "Upstream Request Timeouts",
|
||||
description:
|
||||
"Requests that timed out waiting for an upstream response. Indicates slow or unresponsive services.",
|
||||
legend: "Timeouts",
|
||||
legendUnit: "",
|
||||
metricName: "envoy_cluster_upstream_rq_timeout",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "envoy_cluster_upstream_rq_retry",
|
||||
title: "Upstream Retries",
|
||||
description:
|
||||
"Number of request retries to upstream services. High retries indicate flaky backends.",
|
||||
legend: "Retries",
|
||||
legendUnit: "",
|
||||
metricName: "envoy_cluster_upstream_rq_retry",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "envoy_cluster_upstream_cx_connect_fail",
|
||||
title: "Connection Failures",
|
||||
description:
|
||||
"Failed upstream connection attempts. Indicates network or service availability issues.",
|
||||
legend: "Failures",
|
||||
legendUnit: "",
|
||||
metricName: "envoy_cluster_upstream_cx_connect_fail",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Linkerd metric specs
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function getLinkerdQueries(cluster: string): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_request_total",
|
||||
title: "Request Rate",
|
||||
description:
|
||||
"Total request throughput across all Linkerd proxy sidecars.",
|
||||
legend: "Requests",
|
||||
legendUnit: "req/s",
|
||||
metricName: "request_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_response_latency",
|
||||
title: "Response Latency",
|
||||
description:
|
||||
"End-to-end response latency measured at the Linkerd proxy.",
|
||||
legend: "Latency",
|
||||
legendUnit: "ms",
|
||||
metricName: "response_latency_ms_sum",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_tcp_open_connections",
|
||||
title: "Open TCP Connections",
|
||||
description:
|
||||
"Number of currently open TCP connections managed by Linkerd proxies.",
|
||||
legend: "Connections",
|
||||
legendUnit: "",
|
||||
metricName: "tcp_open_connections",
|
||||
aggregation: AggregationType.Avg,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_tcp_read_bytes",
|
||||
title: "TCP Read Bytes",
|
||||
description:
|
||||
"Total bytes read from TCP connections by Linkerd proxies.",
|
||||
legend: "Read",
|
||||
legendUnit: "",
|
||||
metricName: "tcp_read_bytes_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_tcp_write_bytes",
|
||||
title: "TCP Write Bytes",
|
||||
description:
|
||||
"Total bytes written to TCP connections by Linkerd proxies.",
|
||||
legend: "Written",
|
||||
legendUnit: "",
|
||||
metricName: "tcp_write_bytes_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
yAxisFormatter: KubernetesResourceUtils.formatBytesForChart,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_tcp_open_total",
|
||||
title: "TCP Connections Opened",
|
||||
description:
|
||||
"Total TCP connections opened over time by Linkerd proxies.",
|
||||
legend: "Opened",
|
||||
legendUnit: "",
|
||||
metricName: "tcp_open_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_tcp_close_total",
|
||||
title: "TCP Connections Closed",
|
||||
description:
|
||||
"Total TCP connections closed. Compare with opened to detect leaks.",
|
||||
legend: "Closed",
|
||||
legendUnit: "",
|
||||
metricName: "tcp_close_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Linkerd Control Plane metric specs
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function getLinkerdControlPlaneQueries(
|
||||
cluster: string,
|
||||
): Array<MetricQueryConfigData> {
|
||||
return [
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_identity_certs_issued",
|
||||
title: "mTLS Certificates Issued",
|
||||
description:
|
||||
"Number of mTLS identity certificates issued by the Linkerd identity service.",
|
||||
legend: "Certificates",
|
||||
legendUnit: "",
|
||||
metricName: "identity_cert_rotation_count",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_destination_gets",
|
||||
title: "Destination Lookups",
|
||||
description:
|
||||
"Service discovery lookups to the destination controller. Shows mesh routing activity.",
|
||||
legend: "Lookups",
|
||||
legendUnit: "",
|
||||
metricName: "destination_get_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
buildQuery(
|
||||
{
|
||||
variable: "linkerd_proxy_injector",
|
||||
title: "Proxy Injections",
|
||||
description:
|
||||
"Number of proxy sidecar injections performed by the webhook.",
|
||||
legend: "Injections",
|
||||
legendUnit: "",
|
||||
metricName: "proxy_injector_injection_total",
|
||||
aggregation: AggregationType.Sum,
|
||||
},
|
||||
cluster,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Main component
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
const KubernetesClusterServiceMesh: FunctionComponent<
|
||||
PageComponentProps
|
||||
@@ -30,14 +562,28 @@ const KubernetesClusterServiceMesh: FunctionComponent<
|
||||
const [cluster, setCluster] = useState<KubernetesCluster | null>(null);
|
||||
const [isLoading, setIsLoading] = useState<boolean>(true);
|
||||
const [error, setError] = useState<string>("");
|
||||
const [istioRequestsMetricViewData, setIstioRequestsMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
const [istioLatencyMetricViewData, setIstioLatencyMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
const [linkerdRequestsMetricViewData, setLinkerdRequestsMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
const [linkerdLatencyMetricViewData, setLinkerdLatencyMetricViewData] =
|
||||
useState<MetricViewData | null>(null);
|
||||
|
||||
const [timeRange, setTimeRange] = useState<RangeStartAndEndDateTime>({
|
||||
range: TimeRange.PAST_ONE_HOUR,
|
||||
});
|
||||
|
||||
const [startAndEndDate, setStartAndEndDate] = useState<InBetween<Date>>(
|
||||
RangeStartAndEndDateTimeUtil.getStartAndEndDate({
|
||||
range: TimeRange.PAST_ONE_HOUR,
|
||||
}),
|
||||
);
|
||||
|
||||
const handleTimeRangeChange: (
|
||||
newTimeRange: RangeStartAndEndDateTime,
|
||||
) => void = useCallback(
|
||||
(newTimeRange: RangeStartAndEndDateTime): void => {
|
||||
setTimeRange(newTimeRange);
|
||||
setStartAndEndDate(
|
||||
RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange),
|
||||
);
|
||||
},
|
||||
[],
|
||||
);
|
||||
|
||||
const fetchCluster: PromiseVoidFunction = async (): Promise<void> => {
|
||||
setIsLoading(true);
|
||||
@@ -62,137 +608,6 @@ const KubernetesClusterServiceMesh: FunctionComponent<
|
||||
});
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
if (!cluster) {
|
||||
return;
|
||||
}
|
||||
|
||||
const clusterIdentifier: string = cluster.clusterIdentifier || "";
|
||||
const endDate: Date = OneUptimeDate.getCurrentDate();
|
||||
const startDate: Date = OneUptimeDate.addRemoveHours(endDate, -6);
|
||||
const startAndEndDate: InBetween<Date> = new InBetween(startDate, endDate);
|
||||
|
||||
// Istio metrics
|
||||
const istioRequestsTotalQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "istio_requests_total",
|
||||
title: "Istio Request Rate",
|
||||
description: "Total requests through the Istio service mesh",
|
||||
legend: "Requests",
|
||||
legendUnit: "req/s",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "istio_requests_total",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Sum,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const istioRequestDurationQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "istio_request_duration",
|
||||
title: "Istio Request Latency",
|
||||
description:
|
||||
"Request duration through the Istio service mesh (p50/p99)",
|
||||
legend: "Latency",
|
||||
legendUnit: "ms",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "istio_request_duration_milliseconds_bucket",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Avg,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
// Linkerd metrics
|
||||
const linkerdRequestTotalQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "linkerd_request_total",
|
||||
title: "Linkerd Request Rate",
|
||||
description: "Total requests through the Linkerd service mesh",
|
||||
legend: "Requests",
|
||||
legendUnit: "req/s",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "request_total",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Sum,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const linkerdResponseLatencyQuery: MetricQueryConfigData = {
|
||||
metricAliasData: {
|
||||
metricVariable: "linkerd_response_latency",
|
||||
title: "Linkerd Response Latency",
|
||||
description:
|
||||
"Response latency through the Linkerd service mesh (p50/p99)",
|
||||
legend: "Latency",
|
||||
legendUnit: "ms",
|
||||
},
|
||||
metricQueryData: {
|
||||
filterData: {
|
||||
metricName: "response_latency_ms_bucket",
|
||||
attributes: {
|
||||
"resource.k8s.cluster.name": clusterIdentifier,
|
||||
},
|
||||
aggegationType: AggregationType.Avg,
|
||||
aggregateBy: {},
|
||||
},
|
||||
groupBy: {
|
||||
attributes: true,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
setIstioRequestsMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [istioRequestsTotalQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
|
||||
setIstioLatencyMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [istioRequestDurationQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
|
||||
setLinkerdRequestsMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [linkerdRequestTotalQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
|
||||
setLinkerdLatencyMetricViewData({
|
||||
startAndEndDate: startAndEndDate,
|
||||
queryConfigs: [linkerdResponseLatencyQuery],
|
||||
formulaConfigs: [],
|
||||
});
|
||||
}, [cluster]);
|
||||
|
||||
if (isLoading) {
|
||||
return <PageLoader isVisible={true} />;
|
||||
}
|
||||
@@ -201,71 +616,126 @@ const KubernetesClusterServiceMesh: FunctionComponent<
|
||||
return <ErrorMessage message={error} />;
|
||||
}
|
||||
|
||||
if (
|
||||
!cluster ||
|
||||
!istioRequestsMetricViewData ||
|
||||
!istioLatencyMetricViewData ||
|
||||
!linkerdRequestsMetricViewData ||
|
||||
!linkerdLatencyMetricViewData
|
||||
) {
|
||||
if (!cluster) {
|
||||
return <ErrorMessage message="Cluster not found." />;
|
||||
}
|
||||
|
||||
const clusterIdentifier: string = cluster.clusterIdentifier || "";
|
||||
|
||||
// Build all metric view data
|
||||
const istioData: MetricViewData = buildMetricViewData(
|
||||
getIstioQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
const istioPilotData: MetricViewData = buildMetricViewData(
|
||||
getIstioPilotQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
const envoyData: MetricViewData = buildMetricViewData(
|
||||
getEnvoyQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
const linkerdData: MetricViewData = buildMetricViewData(
|
||||
getLinkerdQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
const linkerdControlPlaneData: MetricViewData = buildMetricViewData(
|
||||
getLinkerdControlPlaneQueries(clusterIdentifier),
|
||||
startAndEndDate,
|
||||
);
|
||||
|
||||
return (
|
||||
<Fragment>
|
||||
<div className="mb-4 p-4 bg-blue-50 border border-blue-200 rounded-lg">
|
||||
<p className="text-sm text-blue-700">
|
||||
Service mesh metrics require the <code>serviceMesh.enabled</code> flag
|
||||
to be set to <code>true</code> and the{" "}
|
||||
<code>serviceMesh.provider</code> to be configured in the
|
||||
kubernetes-agent Helm chart values. Supported providers are Istio and
|
||||
Linkerd.
|
||||
{/* Info banner */}
|
||||
<div className="mb-5 flex items-start gap-3 p-4 bg-blue-50 border border-blue-200 rounded-xl">
|
||||
<div className="flex-shrink-0 mt-0.5">
|
||||
<Icon icon={IconProp.Info} className="h-5 w-5 text-blue-500" />
|
||||
</div>
|
||||
<div>
|
||||
<p className="text-sm font-medium text-blue-800">
|
||||
Service Mesh Metrics Configuration
|
||||
</p>
|
||||
<p className="mt-1 text-sm text-blue-600">
|
||||
Service mesh metrics require{" "}
|
||||
<code className="px-1 py-0.5 bg-blue-100 rounded text-xs font-mono">
|
||||
serviceMesh.enabled: true
|
||||
</code>{" "}
|
||||
and{" "}
|
||||
<code className="px-1 py-0.5 bg-blue-100 rounded text-xs font-mono">
|
||||
serviceMesh.provider
|
||||
</code>{" "}
|
||||
to be configured in the kubernetes-agent Helm chart values.
|
||||
Supported providers are Istio and Linkerd. Only the sections
|
||||
matching your provider will show data.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Global time range picker */}
|
||||
<div className="mb-5 flex items-center justify-end">
|
||||
<RangeStartAndEndDateView
|
||||
dashboardStartAndEndDate={timeRange}
|
||||
onChange={handleTimeRangeChange}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* ── Istio Sections ─────────────────────────────────────────────── */}
|
||||
|
||||
<div className="mb-2">
|
||||
<h2 className="text-lg font-semibold text-gray-900">Istio</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Metrics from Istio Envoy sidecars and Pilot (istiod) control plane.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<Card
|
||||
title="Istio - Request Rate"
|
||||
description="Total request rate through Istio envoy sidecars across all services in the mesh."
|
||||
>
|
||||
<MetricView
|
||||
data={istioRequestsMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
{/* Istio Data Plane */}
|
||||
<MeshSection
|
||||
title="Data Plane — Traffic"
|
||||
description="HTTP and TCP traffic flowing through Envoy sidecar proxies. Covers request throughput, latency, payload sizes, and connection lifecycle."
|
||||
icon={IconProp.ArrowCircleRight}
|
||||
data={istioData}
|
||||
/>
|
||||
|
||||
<Card
|
||||
title="Istio - Request Latency"
|
||||
description="Request duration distribution through the Istio service mesh."
|
||||
>
|
||||
<MetricView
|
||||
data={istioLatencyMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
{/* Istio Control Plane (Pilot / istiod) */}
|
||||
<MeshSection
|
||||
title="Control Plane — Pilot (istiod)"
|
||||
description="Istio Pilot manages xDS configuration distribution to all Envoy proxies. Monitors push throughput, errors, convergence time, and listener conflicts."
|
||||
icon={IconProp.Settings}
|
||||
data={istioPilotData}
|
||||
/>
|
||||
|
||||
<Card
|
||||
title="Linkerd - Request Rate"
|
||||
description="Total request rate through Linkerd proxy sidecars across all services in the mesh."
|
||||
>
|
||||
<MetricView
|
||||
data={linkerdRequestsMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
{/* Envoy Proxy internals */}
|
||||
<MeshSection
|
||||
title="Envoy Proxy"
|
||||
description="Low-level Envoy sidecar proxy metrics. Tracks upstream connection pools, request timeouts, retries, and connection failures."
|
||||
icon={IconProp.Globe}
|
||||
data={envoyData}
|
||||
/>
|
||||
|
||||
<Card
|
||||
title="Linkerd - Response Latency"
|
||||
description="Response latency distribution through the Linkerd service mesh."
|
||||
>
|
||||
<MetricView
|
||||
data={linkerdLatencyMetricViewData}
|
||||
hideQueryElements={true}
|
||||
onChange={() => {}}
|
||||
/>
|
||||
</Card>
|
||||
{/* ── Linkerd Sections ───────────────────────────────────────────── */}
|
||||
|
||||
<div className="mb-2 mt-6">
|
||||
<h2 className="text-lg font-semibold text-gray-900">Linkerd</h2>
|
||||
<p className="text-sm text-gray-500">
|
||||
Metrics from Linkerd proxy sidecars and control plane components.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Linkerd Data Plane */}
|
||||
<MeshSection
|
||||
title="Data Plane — Traffic"
|
||||
description="Request throughput, response latency, and TCP connection metrics from Linkerd proxy sidecars."
|
||||
icon={IconProp.ArrowCircleRight}
|
||||
data={linkerdData}
|
||||
/>
|
||||
|
||||
{/* Linkerd Control Plane */}
|
||||
<MeshSection
|
||||
title="Control Plane"
|
||||
description="Linkerd control plane components: identity (mTLS certificate issuance), destination (service discovery), and proxy injector (sidecar injection)."
|
||||
icon={IconProp.Settings}
|
||||
data={linkerdControlPlaneData}
|
||||
/>
|
||||
</Fragment>
|
||||
);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user