feat: add KEDA autoscaling support for OpenTelemetry Ingest with configurable metrics

This commit is contained in:
Simon Larsen
2025-07-31 12:18:57 +01:00
parent b77ef336b8
commit 59d76b601a
3 changed files with 83 additions and 1 deletions

View File

@@ -701,3 +701,62 @@ spec:
requests:
storage: {{ $.Storage }}
{{- end }}
{{/*
KEDA ScaledObject template for metric-based autoscaling.

Renders a ScaledObject that targets "<release>-<ServiceName>" together with a
TriggerAuthentication named "<release>-<ServiceName>-trigger-auth". Nothing is
rendered unless .Values.keda.enabled and .MetricsConfig.enabled are both truthy
and .DisableAutoscaler is falsy.

Usage: include "oneuptime.kedaScaledObject" (dict "ServiceName" "service-name" "Release" .Release "Values" .Values "MetricsConfig" {...} "DisableAutoscaler" false)

Arguments:
  ServiceName       - service suffix; combined with the release name to build resource names.
  Release           - the chart's .Release object (Name and Namespace are read).
  Values            - the chart's .Values (keda.enabled, oneuptimeSecret and
                      externalSecrets.oneuptimeSecret.existingSecret are read).
  MetricsConfig     - dict with enabled, minReplicas, maxReplicas, pollingInterval,
                      cooldownPeriod and triggers (a list of dicts with query,
                      threshold and port).
  DisableAutoscaler - optional; a truthy value suppresses all output.

Trigger values (serverAddress, query, threshold) are rendered as quoted YAML
strings so that queries containing braces, quotes, colons or "#" stay valid.

NOTE(review): KEDA's prometheus scaler documents serverAddress as the address of
a Prometheus server - confirm that the service's /metrics endpoint can answer
the scaler's query requests.
NOTE(review): the prometheus scaler documents authentication parameters such as
bearerToken or customAuthHeader/customAuthValue, selected via an authModes
metadata field - confirm that the X-Cluster-Key parameter below is honored.
*/}}
{{- define "oneuptime.kedaScaledObject" }}
{{- if and .Values.keda.enabled .MetricsConfig.enabled (not .DisableAutoscaler) }}
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: {{ printf "%s-%s-scaledobject" .Release.Name .ServiceName }}
  namespace: {{ .Release.Namespace }}
  labels:
    app: {{ printf "%s-%s" .Release.Name .ServiceName }}
    app.kubernetes.io/part-of: oneuptime
    app.kubernetes.io/managed-by: Helm
    appname: oneuptime
spec:
  scaleTargetRef:
    name: {{ printf "%s-%s" .Release.Name .ServiceName }}
  minReplicaCount: {{ .MetricsConfig.minReplicas }}
  maxReplicaCount: {{ .MetricsConfig.maxReplicas }}
  pollingInterval: {{ .MetricsConfig.pollingInterval }}
  cooldownPeriod: {{ .MetricsConfig.cooldownPeriod }}
  triggers:
    {{- range .MetricsConfig.triggers }}
    - type: prometheus
      metadata:
        serverAddress: {{ printf "http://%s-%s:%v/metrics" $.Release.Name $.ServiceName .port | quote }}
        query: {{ .query | quote }}
        threshold: {{ .threshold | quote }}
      authenticationRef:
        name: {{ printf "%s-%s-trigger-auth" $.Release.Name $.ServiceName }}
    {{- end }}
---
apiVersion: keda.sh/v1alpha1
kind: TriggerAuthentication
metadata:
  name: {{ printf "%s-%s-trigger-auth" .Release.Name .ServiceName }}
  namespace: {{ .Release.Namespace }}
  labels:
    app: {{ printf "%s-%s" .Release.Name .ServiceName }}
    app.kubernetes.io/part-of: oneuptime
    app.kubernetes.io/managed-by: Helm
    appname: oneuptime
spec:
  secretTargetRef:
    {{- if .Values.oneuptimeSecret }}
    - parameter: X-Cluster-Key
      name: {{ printf "%s-%s" .Release.Name "secrets" }}
      key: oneuptime-secret
    {{- else if .Values.externalSecrets.oneuptimeSecret.existingSecret.name }}
    - parameter: X-Cluster-Key
      name: {{ .Values.externalSecrets.oneuptimeSecret.existingSecret.name }}
      key: {{ .Values.externalSecrets.oneuptimeSecret.existingSecret.passwordKey }}
    {{- end }}
{{- end }}
{{- end }}

View File

@@ -112,8 +112,20 @@ spec:
---
# OneUptime open-telemetry-ingest autoscaler
# Rendered only when autoscaling is not disabled for this service and KEDA is
# not handling its scaling (KEDA applies when both the chart-level and the
# per-service keda.enabled values are true).
{{- if and (not $.Values.openTelemetryIngest.disableAutoscaler) (not (and $.Values.keda.enabled $.Values.openTelemetryIngest.keda.enabled)) }}
{{- $openTelemetryIngestAutoScalerArgs := dict "ServiceName" "open-telemetry-ingest" "Release" $.Release "Values" $.Values -}}
{{- include "oneuptime.autoscaler" $openTelemetryIngestAutoScalerArgs }}
{{- end }}
---
{{/*
KEDA ScaledObject for OpenTelemetry Ingest.
Emitted only when KEDA is enabled both chart-wide and for this service, and the
service's autoscaler has not been disabled. The single trigger uses the
oneuptime_telemetry_queue_size query with the configured queue size threshold
and the service's port.
*/}}
{{- if and .Values.keda.enabled .Values.openTelemetryIngest.keda.enabled (not .Values.openTelemetryIngest.disableAutoscaler) }}
{{- $ingestKeda := .Values.openTelemetryIngest.keda }}
{{- $queueSizeTrigger := dict "query" "oneuptime_telemetry_queue_size" "threshold" $ingestKeda.queueSizeThreshold "port" .Values.port.openTelemetryIngest }}
{{- $scalingConfig := dict "enabled" $ingestKeda.enabled "minReplicas" $ingestKeda.minReplicas "maxReplicas" $ingestKeda.maxReplicas "pollingInterval" $ingestKeda.pollingInterval "cooldownPeriod" $ingestKeda.cooldownPeriod "triggers" (list $queueSizeTrigger) }}
{{- $scaledObjectArgs := dict "ServiceName" "open-telemetry-ingest" "Release" .Release "Values" .Values "MetricsConfig" $scalingConfig "DisableAutoscaler" .Values.openTelemetryIngest.disableAutoscaler }}
{{- include "oneuptime.kedaScaledObject" $scaledObjectArgs }}
{{- end }}
---

View File

@@ -496,6 +496,17 @@ openTelemetryIngest:
disableTelemetryCollection: false
disableAutoscaler: false
resources:
# KEDA autoscaling configuration based on queue metrics.
# Takes effect only when the chart-level keda.enabled value is also true and
# disableAutoscaler is false; the KEDA ScaledObject is then rendered instead of
# the default autoscaler for this service.
keda:
  enabled: false
  minReplicas: 1
  maxReplicas: 100
  # Scale up when queue size exceeds this threshold
  queueSizeThreshold: 100
  # Polling interval for metrics (in seconds)
  pollingInterval: 30
  # Cooldown period after scaling (in seconds)
  cooldownPeriod: 300
fluentIngest:
replicaCount: 1