Add utility classes for telemetry: Monitor, StackTrace, and Syslog parsing

- Implemented MonitorUtil for managing monitor secrets and populating them in monitor steps and tests.
- Created StackTraceParser to parse and structure stack traces from various programming languages.
- Developed SyslogParser to handle and parse syslog messages in both RFC 5424 and RFC 3164 formats.
This commit is contained in:
Nawaz Dhandala
2026-04-02 14:04:13 +01:00
parent 69c6b332c1
commit 5f398bdb31
99 changed files with 125 additions and 8756 deletions

View File

@@ -117,13 +117,13 @@ Usage:
- name: SERVER_APP_HOSTNAME
value: {{ $.Release.Name }}-app.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }}
- name: TELEMETRY_HOSTNAME
value: {{ $.Release.Name }}-telemetry.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }}
value: {{ $.Release.Name }}-app.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }}
- name: SERVER_TELEMETRY_HOSTNAME
value: {{ $.Release.Name }}-telemetry.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }}
value: {{ $.Release.Name }}-app.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }}
- name: APP_PORT
value: {{ $.Values.app.ports.http | squote }}
- name: TELEMETRY_PORT
value: {{ $.Values.telemetry.ports.http | squote }}
value: {{ $.Values.app.ports.http | squote }}
- name: HOME_PORT
value: {{ $.Values.home.ports.http | squote }}
- name: WORKER_CONCURRENCY

View File

@@ -118,11 +118,16 @@ spec:
value: {{ $.Values.app.disableTelemetryCollection | quote }}
- name: ENABLE_PROFILING
value: {{ $.Values.app.enableProfiling | quote }}
- name: TELEMETRY_CONCURRENCY
value: {{ $.Values.app.telemetryConcurrency | default 100 | squote }}
{{- include "oneuptime.env.registerProbeKey" (dict "Values" $.Values "Release" $.Release) | nindent 12 }}
ports:
- containerPort: {{ $.Values.app.ports.http }}
protocol: TCP
name: http
- containerPort: {{ $.Values.app.ports.grpc | default 4317 }}
protocol: TCP
name: grpc
{{- if $.Values.app.resources }}
resources:
{{- toYaml $.Values.app.resources | nindent 12 }}
@@ -141,7 +146,7 @@ spec:
{{- if and $.Values.app.enabled (not $.Values.deployment.disableDeployments) }}
# OneUptime app Service
{{- $appPorts := dict "port" $.Values.app.ports.http -}}
{{- $appPorts := dict "http" $.Values.app.ports.http "grpc" ($.Values.app.ports.grpc | default 4317) -}}
{{- $appServiceArgs := dict "ServiceName" "app" "Ports" $appPorts "Release" $.Release "Values" $.Values -}}
{{- include "oneuptime.service" $appServiceArgs }}
---

View File

@@ -2,13 +2,6 @@
KEDA ScaledObjects for various services
*/}}
{{/* Telemetry KEDA ScaledObject */}}
{{- if and .Values.keda.enabled .Values.telemetry.enabled .Values.telemetry.keda.enabled (not .Values.telemetry.disableAutoscaler) (not .Values.deployment.disableDeployments) }}
{{- $metricsConfig := dict "enabled" .Values.telemetry.keda.enabled "minReplicas" .Values.telemetry.keda.minReplicas "maxReplicas" .Values.telemetry.keda.maxReplicas "pollingInterval" .Values.telemetry.keda.pollingInterval "cooldownPeriod" .Values.telemetry.keda.cooldownPeriod "triggers" (list (dict "query" "oneuptime_telemetry_queue_size" "threshold" .Values.telemetry.keda.queueSizeThreshold "port" .Values.telemetry.ports.http)) }}
{{- $telemetryKedaArgs := dict "ServiceName" "telemetry" "Release" .Release "Values" .Values "MetricsConfig" $metricsConfig "DisableAutoscaler" .Values.telemetry.disableAutoscaler }}
{{- include "oneuptime.kedaScaledObject" $telemetryKedaArgs }}
{{- end }}
{{/* Probe KEDA ScaledObjects - one for each probe configuration */}}
{{- range $key, $val := $.Values.probes }}
{{- $probeEnabled := or (not (hasKey $val "enabled")) $val.enabled }}

View File

@@ -1 +1 @@
{{- /* OTel Collector has been removed. Telemetry ingestion (gRPC + HTTP) is now handled directly by the telemetry service. */ -}}
{{- /* OTel Collector has been removed. Telemetry ingestion (gRPC + HTTP) is now handled directly by the app service. */ -}}

View File

@@ -86,7 +86,7 @@ spec:
- name: OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT
value: {{ $.Values.openTelemetryExporter.endpoint }}
- name: ONEUPTIME_URL
value: http://{{ $.Release.Name }}-telemetry.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }}:{{ $.Values.telemetry.ports.http }}
value: http://{{ $.Release.Name }}-app.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }}:{{ $.Values.app.ports.http }}
- name: PROBE_NAME
value: {{ $val.name }}
- name: PROBE_DESCRIPTION

View File

@@ -1,148 +0,0 @@
{{- if and $.Values.telemetry.enabled (not $.Values.deployment.disableDeployments) }}
# OneUptime telemetry Deployment
# Rendered only when telemetry.enabled is set and deployment.disableDeployments is not.
# The resource name, the "app" label and the selector all use "<release name>-telemetry".
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ printf "%s-%s" $.Release.Name "telemetry" }}
namespace: {{ $.Release.Namespace }}
labels:
app: {{ printf "%s-%s" $.Release.Name "telemetry" }}
app.kubernetes.io/part-of: oneuptime
app.kubernetes.io/managed-by: Helm
appname: oneuptime
# Optional label carrying the Unix epoch at render time.
{{- if $.Values.deployment.includeTimestampLabel }}
date: "{{ now | unixEpoch }}"
{{- end }}
spec:
selector:
matchLabels:
app: {{ printf "%s-%s" $.Release.Name "telemetry" }}
# Replica count: telemetry.replicaCount wins when set. Otherwise deployment.replicaCount
# is used, but only when autoscaling is disabled chart-wide or for telemetry;
# in the remaining case "replicas" is not rendered at all.
{{- if $.Values.telemetry.replicaCount }}
replicas: {{ $.Values.telemetry.replicaCount }}
{{- else }}
{{- if or (not $.Values.autoscaling.enabled) ($.Values.telemetry.disableAutoscaler) }}
replicas: {{ $.Values.deployment.replicaCount }}
{{- end }}
{{- end }}
strategy: {{- toYaml $.Values.deployment.updateStrategy | nindent 4 }}
template:
metadata:
labels:
app: {{ printf "%s-%s" $.Release.Name "telemetry" }}
{{- if $.Values.deployment.includeTimestampLabel }}
date: "{{ now | unixEpoch }}"
{{- end }}
appname: oneuptime
spec:
volumes:
- name: greenlockrc
emptyDir:
sizeLimit: "1Gi"
# Pod security context: the telemetry-specific value takes precedence over the chart-wide one.
{{- if $.Values.telemetry.podSecurityContext }}
securityContext:
{{- toYaml $.Values.telemetry.podSecurityContext | nindent 8 }}
{{- else if $.Values.podSecurityContext }}
securityContext:
{{- toYaml $.Values.podSecurityContext | nindent 8 }}
{{- end }}
{{- if $.Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml $.Values.imagePullSecrets | nindent 8 }}
{{- end }}
{{- if $.Values.affinity }}
affinity: {{- $.Values.affinity | toYaml | nindent 8 }}
{{- end }}
{{- if $.Values.tolerations }}
tolerations: {{- $.Values.tolerations | toYaml | nindent 8 }}
{{- end }}
# Node selector: the telemetry-specific value takes precedence over the chart-wide one.
{{- if $.Values.telemetry.nodeSelector }}
nodeSelector:
{{- toYaml $.Values.telemetry.nodeSelector | nindent 8 }}
{{- else if $.Values.nodeSelector }}
nodeSelector:
{{- toYaml $.Values.nodeSelector | nindent 8 }}
{{- end }}
containers:
- image: {{ include "oneuptime.image" (dict "Values" $.Values "ServiceName" "telemetry") }}
name: {{ printf "%s-%s" $.Release.Name "telemetry" }}
# All three probes are HTTP GETs against telemetry.ports.http; each is rendered
# only when its chart-wide "enabled" flag is set.
{{- if $.Values.startupProbe.enabled }}
# Startup probe
startupProbe:
httpGet:
path: /status/live
port: {{ $.Values.telemetry.ports.http }}
periodSeconds: {{ $.Values.startupProbe.periodSeconds }}
failureThreshold: {{ $.Values.startupProbe.failureThreshold }}
{{- end }}
{{- if $.Values.livenessProbe.enabled }}
# Liveness probe
livenessProbe:
httpGet:
path: /status/live
port: {{ $.Values.telemetry.ports.http }}
periodSeconds: {{ $.Values.livenessProbe.periodSeconds }}
timeoutSeconds: {{ $.Values.livenessProbe.timeoutSeconds }}
initialDelaySeconds: {{ $.Values.livenessProbe.initialDelaySeconds }}
{{- end }}
{{- if $.Values.readinessProbe.enabled }}
# Readiness probe
readinessProbe:
httpGet:
path: /status/ready
port: {{ $.Values.telemetry.ports.http }}
periodSeconds: {{ $.Values.readinessProbe.periodSeconds }}
initialDelaySeconds: {{ $.Values.readinessProbe.initialDelaySeconds }}
timeoutSeconds: {{ $.Values.readinessProbe.timeoutSeconds }}
{{- end }}
# Container security context: the telemetry-specific value takes precedence over the chart-wide one.
{{- if $.Values.telemetry.containerSecurityContext }}
securityContext:
{{- toYaml $.Values.telemetry.containerSecurityContext | nindent 12 }}
{{- else if $.Values.containerSecurityContext }}
securityContext:
{{- toYaml $.Values.containerSecurityContext | nindent 12 }}
{{- end }}
imagePullPolicy: {{ $.Values.image.pullPolicy }}
# Environment: entries from the shared "oneuptime.env.common" and "oneuptime.env.runtime"
# helpers come first, followed by the telemetry-specific variables below.
env:
{{- include "oneuptime.env.common" . | nindent 12 }}
{{- include "oneuptime.env.runtime" (dict "Values" $.Values "Release" $.Release) | nindent 12 }}
- name: PORT
value: {{ $.Values.telemetry.ports.http | quote }}
- name: DISABLE_TELEMETRY
value: {{ $.Values.telemetry.disableTelemetryCollection | quote }}
- name: ENABLE_PROFILING
value: {{ $.Values.telemetry.enableProfiling | quote }}
- name: TELEMETRY_CONCURRENCY
value: {{ $.Values.telemetry.concurrency | squote }}
{{- include "oneuptime.env.registerProbeKey" (dict "Values" $.Values "Release" $.Release) | nindent 12 }}
# The container exposes two TCP ports, named "http" and "grpc".
ports:
- containerPort: {{ $.Values.telemetry.ports.http }}
protocol: TCP
name: http
- containerPort: {{ $.Values.telemetry.ports.grpc }}
protocol: TCP
name: grpc
{{- if $.Values.telemetry.resources }}
resources:
{{- toYaml $.Values.telemetry.resources | nindent 12 }}
{{- end }}
restartPolicy: {{ $.Values.image.restartPolicy }}
---
# OneUptime telemetry autoscaler
# Rendered through the "oneuptime.autoscaler" helper unless the telemetry autoscaler is
# disabled, or KEDA is enabled both chart-wide and for telemetry.
{{- if and (not $.Values.telemetry.disableAutoscaler) (not (and $.Values.keda.enabled $.Values.telemetry.keda.enabled)) }}
{{- $telemetryAutoScalerArgs := dict "ServiceName" "telemetry" "Release" $.Release "Values" $.Values -}}
{{- include "oneuptime.autoscaler" $telemetryAutoScalerArgs }}
{{- end }}
{{- end }}
---
{{- if and $.Values.telemetry.enabled (not $.Values.deployment.disableDeployments) }}
# OneUptime telemetry Service
# Guarded by the same condition as the telemetry Deployment. The "http" and "grpc" ports
# from telemetry.ports are collected into a dict and handed to the shared
# "oneuptime.service" helper under the service name "telemetry".
{{- $telemetryPorts := dict "http" $.Values.telemetry.ports.http "grpc" $.Values.telemetry.ports.grpc -}}
{{- $telemetryServiceArgs := dict "ServiceName" "telemetry" "Ports" $telemetryPorts "Release" $.Release "Values" $.Values -}}
{{- include "oneuptime.service" $telemetryServiceArgs }}
---
{{- end }}

View File

@@ -1755,49 +1755,7 @@
"workerConcurrency": {
"type": "integer"
},
"disableTelemetryCollection": {
"type": "boolean"
},
"disableAutoscaler": {
"type": "boolean"
},
"ports": {
"type": "object",
"properties": {
"http": {
"type": "integer"
}
},
"additionalProperties": false
},
"resources": {
"type": [
"object",
"null"
]
},
"nodeSelector": {
"type": "object"
},
"podSecurityContext": {
"type": "object"
},
"containerSecurityContext": {
"type": "object"
},
"enableProfiling": {
"type": "boolean"
}
},
"additionalProperties": false
},
"telemetry": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"replicaCount": {
"telemetryConcurrency": {
"type": "integer"
},
"disableTelemetryCollection": {
@@ -1806,9 +1764,6 @@
"disableAutoscaler": {
"type": "boolean"
},
"concurrency": {
"type": "integer"
},
"ports": {
"type": "object",
"properties": {
@@ -1816,7 +1771,8 @@
"type": "integer"
},
"grpc": {
"type": "integer"
"type": "integer",
"default": 4317
}
},
"additionalProperties": false
@@ -1836,30 +1792,6 @@
"containerSecurityContext": {
"type": "object"
},
"keda": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"minReplicas": {
"type": "integer"
},
"maxReplicas": {
"type": "integer"
},
"queueSizeThreshold": {
"type": "integer"
},
"pollingInterval": {
"type": "integer"
},
"cooldownPeriod": {
"type": "integer"
}
},
"additionalProperties": false
},
"enableProfiling": {
"type": "boolean"
}

View File

@@ -650,42 +650,18 @@ app:
enabled: true
replicaCount: 1
workerConcurrency: 100
# Max concurrent telemetry ingestion jobs processed by each pod
telemetryConcurrency: 100
disableTelemetryCollection: false
enableProfiling: false
disableAutoscaler: false
ports:
http: 3002
resources:
nodeSelector: {}
podSecurityContext: {}
containerSecurityContext: {}
telemetry:
enabled: true
replicaCount: 1
disableTelemetryCollection: false
enableProfiling: false
disableAutoscaler: false
# Max concurrent telemetry jobs processed by each pod
concurrency: 100
ports:
http: 3403
grpc: 4317
resources:
nodeSelector: {}
podSecurityContext: {}
containerSecurityContext: {}
# KEDA autoscaling configuration based on queue metrics
keda:
enabled: false
minReplicas: 1
maxReplicas: 100
# Scale up when queue size exceeds this threshold
queueSizeThreshold: 100
# Polling interval for metrics (in seconds)
pollingInterval: 30
# Cooldown period after scaling (in seconds)
cooldownPeriod: 300
# AI Agent Configuration
# Deploy this to run an AI Agent within your Kubernetes cluster