feat: Add Kubernetes Cluster Management and Monitoring Agent

- Implemented a new migration for the KubernetesCluster and KubernetesClusterLabel tables in the database.
- Created a KubernetesClusterService for managing cluster instances, including methods for finding or creating clusters, updating their status, and marking disconnected clusters.
- Introduced a Helm chart for the OneUptime Kubernetes Monitoring Agent, including configuration files, deployment templates, and RBAC settings.
- Added support for collecting metrics and logs from Kubernetes clusters using OpenTelemetry.
- Configured service accounts, secrets, and resource limits for the agent's deployment and daemonset.
- Provided detailed notes and helper templates for the Helm chart to facilitate installation and configuration.
This commit is contained in:
Nawaz Dhandala
2026-03-17 15:29:52 +00:00
parent da6c749d96
commit bc9949abe4
39 changed files with 4106 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
# Patterns to ignore when building packages.
.DS_Store
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
*.swp
*.bak
*.tmp
*.orig
*~
.project
.idea/
*.tmproj
.vscode/

View File

@@ -0,0 +1,15 @@
apiVersion: v2
name: kubernetes-agent
description: OneUptime Kubernetes Monitoring Agent — collects cluster metrics, events, and logs via OpenTelemetry and sends them to your OneUptime instance.
icon: https://raw.githubusercontent.com/OneUptime/oneuptime/master/Home/Static/img/OneUptimePNG/1.png
type: application
version: 0.1.0
appVersion: "1.0.0"
annotations:
artifacthub.io/license: MIT
artifacthub.io/category: monitoring-logging
artifacthub.io/prerelease: "false"

View File

@@ -0,0 +1,24 @@
OneUptime Kubernetes Agent has been installed.
Cluster Name: {{ .Values.clusterName }}
OneUptime URL: {{ .Values.oneuptime.url }}
The agent is now collecting:
- Node, pod, and container resource metrics (kubeletstats)
- Cluster-level metrics: deployments, replicas, pod phases (k8s_cluster)
- Kubernetes events (k8sobjects)
{{- if .Values.controlPlane.enabled }}
- Control plane metrics: etcd, API server, scheduler, controller manager (prometheus)
{{- end }}
{{- if .Values.logs.enabled }}
- Pod logs from /var/log/pods (filelog DaemonSet)
{{- end }}
To verify the agent is running:
kubectl get pods -n {{ .Release.Namespace }} -l app.kubernetes.io/name={{ include "kubernetes-agent.name" . }}
To check collector logs:
kubectl logs -n {{ .Release.Namespace }} -l app.kubernetes.io/name={{ include "kubernetes-agent.name" . }} -c otel-collector
Your cluster should appear in OneUptime within a few minutes at:
{{ .Values.oneuptime.url }}/dashboard/<project-id>/kubernetes

View File

@@ -0,0 +1,59 @@
{{/*
Expand the name of the chart.
Truncated to 63 characters because Kubernetes label values and some
resource names are limited to 63 characters (DNS label length).
*/}}
{{- define "kubernetes-agent.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
If fullnameOverride is set it wins; otherwise the release name is
combined with the chart name, unless the release name already contains
the chart name (avoids "myagent-kubernetes-agent"-style duplication).
*/}}
{{- define "kubernetes-agent.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
"+" is replaced because it is not allowed in label values.
*/}}
{{- define "kubernetes-agent.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels applied to every resource in the chart.
*/}}
{{- define "kubernetes-agent.labels" -}}
helm.sh/chart: {{ include "kubernetes-agent.chart" . }}
{{ include "kubernetes-agent.selectorLabels" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
app.kubernetes.io/part-of: oneuptime
{{- end }}
{{/*
Selector labels — the immutable subset used in pod selectors.
Keep this minimal: selectors cannot be changed after creation.
*/}}
{{- define "kubernetes-agent.selectorLabels" -}}
app.kubernetes.io/name: {{ include "kubernetes-agent.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Service account name: an explicit serviceAccount.name wins,
otherwise the chart fullname is used.
*/}}
{{- define "kubernetes-agent.serviceAccountName" -}}
{{- if .Values.serviceAccount.name }}
{{- .Values.serviceAccount.name }}
{{- else }}
{{- include "kubernetes-agent.fullname" . }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,134 @@
{{- if .Values.logs.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "kubernetes-agent.fullname" . }}-daemonset
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubernetes-agent.labels" . | nindent 4 }}
data:
  # OpenTelemetry Collector configuration for the log-collecting DaemonSet.
  # Tails pod log files from the node's /var/log/pods hostPath, parses the
  # container runtime's log format, enriches with Kubernetes metadata, and
  # exports to the OneUptime OTLP/HTTP endpoint.
  otel-collector-config.yaml: |
    receivers:
      # Collect pod logs from /var/log/pods
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          # Exclude the agent's own logs to avoid a feedback loop.
          # This must be rendered at Helm template time: the filelog
          # receiver performs no templating, so a literal
          # "{{ printf "{{ .Release.Namespace }}" }}" in the glob would never match.
          # Pod log directories are named <namespace>_<pod-name>_<uid>.
          - /var/log/pods/{{ .Release.Namespace }}_{{ include "kubernetes-agent.fullname" . }}*/**/*.log
        start_at: end
        include_file_path: true
        include_file_name: false
        operators:
          # Route each line to the parser matching its container runtime
          # log format (Docker JSON, CRI, or containerd).
          - type: router
            id: get-format
            routes:
              - output: parser-docker
                expr: 'body matches "^\\{"'
              - output: parser-cri
                expr: 'body matches "^[^ Z]+ "'
              - output: parser-containerd
                expr: 'body matches "^[^ Z]+Z"'
          # Docker JSON log format
          - type: json_parser
            id: parser-docker
            output: extract-metadata-from-filepath
            timestamp:
              parse_from: attributes.time
              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
          # CRI log format
          - type: regex_parser
            id: parser-cri
            regex: '^(?P<time>[^ Z]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
            output: extract-metadata-from-filepath
            timestamp:
              parse_from: attributes.time
              layout: '%Y-%m-%dT%H:%M:%S.%L%j'
          # Containerd log format
          - type: regex_parser
            id: parser-containerd
            regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
            output: extract-metadata-from-filepath
            timestamp:
              parse_from: attributes.time
              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
          # Extract namespace/pod/uid/container from the file path
          # (/var/log/pods/<ns>_<pod>_<uid>/<container>/<restart>.log).
          - type: regex_parser
            id: extract-metadata-from-filepath
            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
            parse_from: attributes["log.file.path"]
          # Promote the parsed log line to the record body and move the
          # extracted metadata into standard resource attributes.
          - type: move
            from: attributes.log
            to: body
          - type: move
            from: attributes.stream
            to: attributes["log.iostream"]
          - type: move
            from: attributes.namespace
            to: resource["k8s.namespace.name"]
          - type: move
            from: attributes.pod_name
            to: resource["k8s.pod.name"]
          - type: move
            from: attributes.container_name
            to: resource["k8s.container.name"]
          - type: move
            from: attributes.uid
            to: resource["k8s.pod.uid"]
    processors:
      # Enrich with K8s metadata (workload names, node) looked up via the
      # API server, associating records by the pod UID parsed above.
      k8sattributes:
        auth_type: serviceAccount
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.namespace.name
            - k8s.node.name
            - k8s.deployment.name
            - k8s.replicaset.name
            - k8s.statefulset.name
            - k8s.daemonset.name
            - k8s.container.name
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
      # Stamp all records with the configured cluster name.
      resource:
        attributes:
          - key: k8s.cluster.name
            value: {{ .Values.clusterName | quote }}
            action: upsert
      batch:
        send_batch_size: 1024
        timeout: 10s
      # memory_limiter must run first in the pipeline (see service below)
      # so back-pressure is applied before any expensive processing.
      memory_limiter:
        check_interval: 5s
        limit_mib: 200
        spike_limit_mib: 50
    exporters:
      otlphttp:
        endpoint: "{{ .Values.oneuptime.url }}"
        headers:
          # Injected from the chart's Secret via the DaemonSet pod env.
          x-oneuptime-token: "${env:ONEUPTIME_API_KEY}"
    service:
      pipelines:
        logs:
          receivers:
            - filelog
          processors:
            - memory_limiter
            - k8sattributes
            - resource
            - batch
          exporters:
            - otlphttp
{{- end }}

View File

@@ -0,0 +1,176 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "kubernetes-agent.fullname" . }}-deployment
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubernetes-agent.labels" . | nindent 4 }}
data:
  # OpenTelemetry Collector configuration for the metrics/events Deployment.
  # Collects kubelet, cluster, and (optionally) control-plane metrics plus
  # Kubernetes events, and exports everything to OneUptime via OTLP/HTTP.
  otel-collector-config.yaml: |
    extensions:
      # Health endpoint consumed by the Deployment's liveness/readiness
      # probes on port 13133. Without this extension the collector does
      # not listen on that port and the probes fail, crash-looping the pod.
      health_check:
        endpoint: 0.0.0.0:13133
    receivers:
      # Collect node, pod, and container resource metrics from the kubelet
      # of the node this pod is scheduled on (NODE_NAME is injected via the
      # downward API).
      # NOTE(review): because this runs as a Deployment, kubeletstats only
      # scrapes the single node hosting the pod — confirm whether per-node
      # metrics for all nodes are expected here or from a DaemonSet.
      kubeletstats:
        collection_interval: {{ .Values.collectionInterval }}
        auth_type: serviceAccount
        endpoint: "https://${env:NODE_NAME}:10250"
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
        extra_metadata_labels:
          - container.id
        k8s_api_config:
          auth_type: serviceAccount
      # Collect cluster-level metrics from the Kubernetes API
      k8s_cluster:
        collection_interval: {{ .Values.collectionInterval }}
        node_conditions_to_report:
          - Ready
          - MemoryPressure
          - DiskPressure
          - PIDPressure
          - NetworkUnavailable
        allocatable_types_to_report:
          - cpu
          - memory
          - storage
      # Watch Kubernetes events and ingest them as logs
      k8sobjects:
        objects:
          - name: events
            mode: watch
            group: events.k8s.io
      {{- if .Values.controlPlane.enabled }}
      # Scrape control plane metrics via Prometheus endpoints.
      # Only meaningful on self-managed clusters; managed offerings
      # generally do not expose these endpoints.
      prometheus:
        config:
          scrape_configs:
            - job_name: etcd
              scheme: https
              tls_config:
                insecure_skip_verify: {{ .Values.controlPlane.etcd.insecureSkipVerify }}
              static_configs:
                {{- range .Values.controlPlane.etcd.endpoints }}
                - targets:
                    - {{ . | quote }}
                {{- end }}
            - job_name: kube-apiserver
              scheme: https
              tls_config:
                insecure_skip_verify: {{ .Values.controlPlane.apiServer.insecureSkipVerify }}
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              static_configs:
                {{- range .Values.controlPlane.apiServer.endpoints }}
                - targets:
                    - {{ . | quote }}
                {{- end }}
            - job_name: kube-scheduler
              scheme: https
              tls_config:
                insecure_skip_verify: {{ .Values.controlPlane.scheduler.insecureSkipVerify }}
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              static_configs:
                {{- range .Values.controlPlane.scheduler.endpoints }}
                - targets:
                    - {{ . | quote }}
                {{- end }}
            - job_name: kube-controller-manager
              scheme: https
              tls_config:
                insecure_skip_verify: {{ .Values.controlPlane.controllerManager.insecureSkipVerify }}
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              static_configs:
                {{- range .Values.controlPlane.controllerManager.endpoints }}
                - targets:
                    - {{ . | quote }}
                {{- end }}
      {{- end }}
    processors:
      # Enrich all telemetry with Kubernetes metadata
      k8sattributes:
        auth_type: serviceAccount
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.namespace.name
            - k8s.node.name
            - k8s.deployment.name
            - k8s.replicaset.name
            - k8s.statefulset.name
            - k8s.daemonset.name
            - k8s.job.name
            - k8s.cronjob.name
            - k8s.container.name
          labels:
            - tag_name: k8s.pod.label.app
              key: app
              from: pod
            - tag_name: k8s.pod.label.app.kubernetes.io/name
              key: app.kubernetes.io/name
              from: pod
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: connection
      # Stamp all telemetry with the cluster name
      resource:
        attributes:
          - key: k8s.cluster.name
            value: {{ .Values.clusterName | quote }}
            action: upsert
      # Batch telemetry for efficient export
      batch:
        send_batch_size: 1024
        timeout: 10s
      # Limit memory usage; listed first in each pipeline so back-pressure
      # applies before expensive processing.
      memory_limiter:
        check_interval: 5s
        limit_mib: 400
        spike_limit_mib: 100
    exporters:
      otlphttp:
        endpoint: "{{ .Values.oneuptime.url }}"
        headers:
          # Injected from the chart's Secret via the Deployment pod env.
          x-oneuptime-token: "${env:ONEUPTIME_API_KEY}"
    service:
      extensions:
        - health_check
      pipelines:
        metrics:
          receivers:
            - kubeletstats
            - k8s_cluster
            {{- if .Values.controlPlane.enabled }}
            - prometheus
            {{- end }}
          processors:
            - memory_limiter
            - k8sattributes
            - resource
            - batch
          exporters:
            - otlphttp
        logs:
          receivers:
            - k8sobjects
          processors:
            - memory_limiter
            - k8sattributes
            - resource
            - batch
          exporters:
            - otlphttp
View File

@@ -0,0 +1,56 @@
{{- if .Values.logs.enabled }}
# Per-node log collector: one OTel Collector pod on each node, tailing the
# node-local /var/log/pods directory. Only rendered when logs are enabled.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "kubernetes-agent.fullname" . }}-logs
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubernetes-agent.labels" . | nindent 4 }}
    component: log-collector
spec:
  selector:
    matchLabels:
      {{- include "kubernetes-agent.selectorLabels" . | nindent 6 }}
      component: log-collector
  template:
    metadata:
      labels:
        {{- include "kubernetes-agent.selectorLabels" . | nindent 8 }}
        component: log-collector
      annotations:
        # Hash of the rendered ConfigMap so pods roll automatically
        # whenever the collector configuration changes.
        checksum/config: {{ include (print $.Template.BasePath "/configmap-daemonset.yaml") . | sha256sum }}
    spec:
      # NOTE(review): no tolerations are set, so tainted nodes (e.g.
      # control-plane nodes) will not run a log collector — confirm
      # whether logs from those nodes are expected.
      serviceAccountName: {{ include "kubernetes-agent.serviceAccountName" . }}
      containers:
        - name: otel-collector
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          args:
            - "--config=/etc/otel/otel-collector-config.yaml"
          env:
            # Node name via the downward API (available to the collector config).
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            # OneUptime ingest token, sourced from the chart's Secret and
            # referenced as ${env:ONEUPTIME_API_KEY} in the collector config.
            - name: ONEUPTIME_API_KEY
              valueFrom:
                secretKeyRef:
                  name: {{ include "kubernetes-agent.fullname" . }}
                  key: api-key
          resources:
            {{- toYaml .Values.logs.resources | nindent 12 }}
          volumeMounts:
            - name: config
              mountPath: /etc/otel
              readOnly: true
            # Read-only view of the node's pod log files.
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
      volumes:
        - name: config
          configMap:
            name: {{ include "kubernetes-agent.fullname" . }}-daemonset
        - name: varlogpods
          hostPath:
            path: /var/log/pods
{{- end }}

View File

@@ -0,0 +1,67 @@
# Cluster-wide metrics/events collector: a single (by default) OTel Collector
# pod that scrapes kubelet stats, cluster metrics, events, and optionally
# control-plane Prometheus endpoints.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "kubernetes-agent.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubernetes-agent.labels" . | nindent 4 }}
    component: metrics-collector
spec:
  replicas: {{ .Values.deployment.replicas }}
  selector:
    matchLabels:
      {{- include "kubernetes-agent.selectorLabels" . | nindent 6 }}
      component: metrics-collector
  template:
    metadata:
      labels:
        {{- include "kubernetes-agent.selectorLabels" . | nindent 8 }}
        component: metrics-collector
      annotations:
        # Hash of the rendered ConfigMap so pods roll automatically
        # whenever the collector configuration changes.
        checksum/config: {{ include (print $.Template.BasePath "/configmap-deployment.yaml") . | sha256sum }}
    spec:
      serviceAccountName: {{ include "kubernetes-agent.serviceAccountName" . }}
      containers:
        - name: otel-collector
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          args:
            - "--config=/etc/otel/otel-collector-config.yaml"
          env:
            # Node name via the downward API; used by the kubeletstats
            # receiver endpoint in the collector config.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: CLUSTER_NAME
              value: {{ .Values.clusterName | quote }}
            # OneUptime ingest token, sourced from the chart's Secret and
            # referenced as ${env:ONEUPTIME_API_KEY} in the collector config.
            - name: ONEUPTIME_API_KEY
              valueFrom:
                secretKeyRef:
                  name: {{ include "kubernetes-agent.fullname" . }}
                  key: api-key
          ports:
            # Collector health endpoint.
            # NOTE(review): these probes assume the collector config enables
            # the health_check extension on 13133 — confirm it is present in
            # the deployment ConfigMap, otherwise the probes will fail.
            - name: health
              containerPort: 13133
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /
              port: health
            initialDelaySeconds: 15
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /
              port: health
            initialDelaySeconds: 5
            periodSeconds: 10
          resources:
            {{- toYaml .Values.deployment.resources | nindent 12 }}
          volumeMounts:
            - name: config
              mountPath: /etc/otel
              readOnly: true
      volumes:
        - name: config
          configMap:
            name: {{ include "kubernetes-agent.fullname" . }}-deployment

View File

@@ -0,0 +1,88 @@
{{- if .Values.serviceAccount.create }}
# Service account shared by the metrics Deployment and the logs DaemonSet.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "kubernetes-agent.serviceAccountName" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "kubernetes-agent.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
{{- end }}
---
# Cluster-wide read-only access covering everything the collector's
# receivers and the k8sattributes processor need to watch.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "kubernetes-agent.fullname" . }}
  labels:
    {{- include "kubernetes-agent.labels" . | nindent 4 }}
rules:
  # For k8s_cluster receiver and k8sattributes processor
  - apiGroups: [""]
    resources:
      - pods
      - nodes
      - nodes/proxy
      - nodes/stats
      - services
      - endpoints
      - namespaces
      - events
      - replicationcontrollers
      - resourcequotas
      - limitranges
      - configmaps
      - persistentvolumeclaims
      - persistentvolumes
    verbs: ["get", "list", "watch"]
  # Workload controllers, for workload-name metadata enrichment
  - apiGroups: ["apps"]
    resources:
      - deployments
      - replicasets
      - statefulsets
      - daemonsets
    verbs: ["get", "list", "watch"]
  - apiGroups: ["batch"]
    resources:
      - jobs
      - cronjobs
    verbs: ["get", "list", "watch"]
  - apiGroups: ["autoscaling"]
    resources:
      - horizontalpodautoscalers
    verbs: ["get", "list", "watch"]
  - apiGroups: ["networking.k8s.io"]
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  # Legacy ingress API group, kept for older clusters
  - apiGroups: ["extensions"]
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  # For k8sobjects receiver to watch events
  - apiGroups: ["events.k8s.io"]
    resources:
      - events
    verbs: ["get", "list", "watch"]
  # For kubeletstats receiver
  - nonResourceURLs:
      - /metrics
      - /metrics/cadvisor
    verbs: ["get"]
---
# Bind the cluster role to the agent's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "kubernetes-agent.fullname" . }}
  labels:
    {{- include "kubernetes-agent.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "kubernetes-agent.fullname" . }}
subjects:
  - kind: ServiceAccount
    name: {{ include "kubernetes-agent.serviceAccountName" . }}
    namespace: {{ .Release.Namespace }}

View File

@@ -0,0 +1,10 @@
apiVersion: v1
kind: Secret
metadata:
name: {{ include "kubernetes-agent.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "kubernetes-agent.labels" . | nindent 4 }}
type: Opaque
data:
api-key: {{ .Values.oneuptime.apiKey | b64enc | quote }}

View File

@@ -0,0 +1,80 @@
# OneUptime Kubernetes Agent Configuration
# Required: Your OneUptime instance connection details
oneuptime:
  # URL of your OneUptime instance (e.g., https://oneuptime.example.com)
  url: ""
  # Project API key from OneUptime (Settings > API Keys)
  apiKey: ""
# Required: Unique name for this cluster (used as k8s.cluster.name attribute)
clusterName: ""
# Namespace filters — limit which namespaces are monitored
# NOTE(review): these values do not appear to be referenced by any template
# in this chart yet — confirm whether filtering is implemented or planned.
namespaceFilters:
  # If set, only these namespaces are monitored (empty = all namespaces)
  include: []
  # Namespaces to exclude from monitoring
  exclude:
    - kube-system
# OTel Collector image configuration
image:
  repository: otel/opentelemetry-collector-contrib
  tag: "0.96.0"
  pullPolicy: IfNotPresent
# Deployment (metrics + events collector) resource configuration
deployment:
  replicas: 1
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
# Control plane monitoring (etcd, API server, scheduler, controller manager)
# Disabled by default — enable for self-managed clusters.
# Managed K8s (EKS, GKE, AKS) typically don't expose control plane metrics.
controlPlane:
  enabled: false
  etcd:
    # Endpoints to scrape etcd metrics from
    endpoints:
      - https://localhost:2379/metrics
    # TLS configuration for etcd
    insecureSkipVerify: true
  apiServer:
    endpoints:
      - https://localhost:6443/metrics
    insecureSkipVerify: true
  scheduler:
    endpoints:
      - https://localhost:10259/metrics
    insecureSkipVerify: true
  controllerManager:
    endpoints:
      - https://localhost:10257/metrics
    insecureSkipVerify: true
# Pod log collection via DaemonSet with filelog receiver
logs:
  enabled: true
  # Resources for each per-node log-collector pod
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi
# Collection intervals
# Applied to the kubeletstats and k8s_cluster receivers.
collectionInterval: 30s
# Service account configuration
serviceAccount:
  create: true
  # If empty, a name is generated from the chart fullname
  name: ""
  annotations: {}