feat: add KEDA autoscaling configuration for worker and telemetry queue metrics

2026-04-06 00:32:12 +02:00 · 2026-04-02 14:20:57 +01:00
parent 6f7907102b
commit 407d4e3687
5 changed files with 52 additions and 3 deletions
--- a/HelmChart/Public/oneuptime/templates/_helpers.tpl
+++ b/HelmChart/Public/oneuptime/templates/_helpers.tpl
@@ -832,7 +832,7 @@ spec:
    - type: metrics-api
      metadata:
        targetValue: {{ .threshold | quote }}
-        url: http://{{ printf "%s-%s" $.Release.Name $.ServiceName }}:{{ .port }}/metrics/queue-size
+        url: http://{{ printf "%s-%s" $.Release.Name $.ServiceName }}:{{ .port }}{{ .urlPath | default "/metrics/queue-size" }}
        valueLocation: 'queueSize'
        method: 'GET'
      # authenticationRef:
--- a/HelmChart/Public/oneuptime/templates/app.yaml
+++ b/HelmChart/Public/oneuptime/templates/app.yaml
@@ -17,6 +17,7 @@ spec:
  selector:
    matchLabels:
      app: {{ printf "%s-%s" $.Release.Name "app"  }}
+  {{- /* Leave replicas unset only when KEDA is enabled globally and for app AND the app autoscaler is not disabled; otherwise keep the configured replica count. */ -}}{{- if not (and $.Values.keda.enabled $.Values.app.keda.enabled (not $.Values.app.disableAutoscaler)) }}
  {{- if and $.Values.app (hasKey $.Values.app "replicaCount") (ne $.Values.app.replicaCount nil) }}
  replicas: {{ $.Values.app.replicaCount }}
  {{- else }}
@@ -24,6 +25,7 @@ spec:
  replicas: {{ $.Values.deployment.replicaCount }}
  {{- end }}
  {{- end }}
+  {{- end }}
  strategy: {{- toYaml $.Values.deployment.updateStrategy | nindent 4 }}
  template:
    metadata:
@@ -136,8 +138,8 @@ spec:

 ---

-# OneUptime app autoscaler
-{{- if not $.Values.app.disableAutoscaler }}
+# OneUptime app autoscaler (HPA) - not rendered when the autoscaler is disabled or KEDA is enabled for the app
+{{- if not (or $.Values.app.disableAutoscaler (and $.Values.keda.enabled $.Values.app.keda.enabled)) }}
 {{- $appAutoScalerArgs := dict "ServiceName" "app" "Release" $.Release "Values" $.Values -}}
 {{- include "oneuptime.autoscaler" $appAutoScalerArgs }}
 {{- end }}
--- a/HelmChart/Public/oneuptime/templates/keda-scaledobjects.yaml
+++ b/HelmChart/Public/oneuptime/templates/keda-scaledobjects.yaml
@@ -17,6 +17,13 @@ KEDA ScaledObjects for various services
 {{- end }}
 {{- end }}

+{{/* KEDA ScaledObject for the app deployment: one queue-size trigger each for the worker and telemetry queues */}}
+{{- if and .Values.keda.enabled .Values.app.enabled .Values.app.keda.enabled (not .Values.app.disableAutoscaler) (not .Values.deployment.disableDeployments) }}
+{{- $appQueueTriggers := list (dict "query" "oneuptime_app_worker_queue_size" "threshold" .Values.app.keda.workerQueueSizeThreshold "port" .Values.app.ports.http "urlPath" "/worker/metrics/queue-size") (dict "query" "oneuptime_app_telemetry_queue_size" "threshold" .Values.app.keda.telemetryQueueSizeThreshold "port" .Values.app.ports.http "urlPath" "/telemetry/metrics/queue-size") }}
+{{- $appScalingConfig := dict "enabled" .Values.app.keda.enabled "minReplicas" .Values.app.keda.minReplicas "maxReplicas" .Values.app.keda.maxReplicas "pollingInterval" .Values.app.keda.pollingInterval "cooldownPeriod" .Values.app.keda.cooldownPeriod "triggers" $appQueueTriggers }}
+{{- include "oneuptime.kedaScaledObject" (dict "ServiceName" "app" "Release" .Release "Values" .Values "MetricsConfig" $appScalingConfig "DisableAutoscaler" .Values.app.disableAutoscaler) }}
+{{- end }}
+
 {{/* AI Agent KEDA ScaledObject */}}
 {{- if and .Values.keda.enabled .Values.aiAgent.enabled .Values.aiAgent.keda.enabled (not .Values.aiAgent.disableAutoscaler) (not .Values.deployment.disableDeployments) }}
 {{- $metricsConfig := dict "enabled" .Values.aiAgent.keda.enabled "minReplicas" .Values.aiAgent.keda.minReplicas "maxReplicas" .Values.aiAgent.keda.maxReplicas "pollingInterval" .Values.aiAgent.keda.pollingInterval "cooldownPeriod" .Values.aiAgent.keda.cooldownPeriod "triggers" (list (dict "query" "oneuptime_ai_agent_queue_size" "threshold" .Values.aiAgent.keda.queueSizeThreshold "port" .Values.aiAgent.ports.http)) }}
--- a/HelmChart/Public/oneuptime/values.schema.json
+++ b/HelmChart/Public/oneuptime/values.schema.json
@@ -1794,6 +1794,33 @@
                },
                "enableProfiling": {
                    "type": "boolean"
+                },
+                "keda": {
+                    "type": "object",
+                    "properties": {
+                        "enabled": {
+                            "type": "boolean"
+                        },
+                        "minReplicas": {
+                            "type": "integer"
+                        },
+                        "maxReplicas": {
+                            "type": "integer"
+                        },
+                        "workerQueueSizeThreshold": {
+                            "type": "integer"
+                        },
+                        "telemetryQueueSizeThreshold": {
+                            "type": "integer"
+                        },
+                        "pollingInterval": {
+                            "type": "integer"
+                        },
+                        "cooldownPeriod": {
+                            "type": "integer"
+                        }
+                    },
+                    "additionalProperties": false
                }
            },
            "additionalProperties": false
--- a/HelmChart/Public/oneuptime/values.yaml
+++ b/HelmChart/Public/oneuptime/values.yaml
@@ -662,6 +662,19 @@ app:
  nodeSelector: {}
  podSecurityContext: {}
  containerSecurityContext: {}
+  # KEDA autoscaling configuration based on worker and telemetry queue metrics
+  keda:
+    enabled: false
+    minReplicas: 1
+    maxReplicas: 100
+    # Target worker queue size, passed to the KEDA metrics-api trigger as targetValue; the app scales up once the queue exceeds it
+    workerQueueSizeThreshold: 10
+    # Target telemetry queue size, passed to the KEDA metrics-api trigger as targetValue; the app scales up once the queue exceeds it
+    telemetryQueueSizeThreshold: 10
+    # Polling interval for metrics (in seconds)
+    pollingInterval: 30
+    # Cooldown period after scaling (in seconds)
+    cooldownPeriod: 300

 # AI Agent Configuration
 # Deploy this to run an AI Agent within your Kubernetes cluster