feat: add CPU and memory utilization metrics for KEDA autoscaling

2026-04-06 00:32:12 +02:00 · 2026-04-02 14:23:39 +01:00
parent 407d4e3687
commit eb4010dfa5
4 changed files with 23 additions and 1 deletions
--- a/HelmChart/Public/oneuptime/templates/_helpers.tpl
+++ b/HelmChart/Public/oneuptime/templates/_helpers.tpl
@@ -838,6 +838,18 @@ spec:
      # authenticationRef:
      #   name: {{ printf "%s-%s-trigger-auth" $.Release.Name $.ServiceName }}
    {{- end }}
+    {{- if and .MetricsConfig.targetCPUUtilizationPercentage (gt (int .MetricsConfig.targetCPUUtilizationPercentage) 0) }}
+    - type: cpu  # added only when MetricsConfig.targetCPUUtilizationPercentage is non-empty and > 0 as an int
+      metricType: Utilization
+      metadata:
+        value: {{ .MetricsConfig.targetCPUUtilizationPercentage | quote }}  # rendered as a quoted string
+    {{- end }}
+    {{- if and .MetricsConfig.targetMemoryUtilizationPercentage (gt (int .MetricsConfig.targetMemoryUtilizationPercentage) 0) }}
+    - type: memory  # added only when MetricsConfig.targetMemoryUtilizationPercentage is non-empty and > 0 as an int
+      metricType: Utilization
+      metadata:
+        value: {{ .MetricsConfig.targetMemoryUtilizationPercentage | quote }}  # rendered as a quoted string
+    {{- end }}
 ---
 apiVersion: keda.sh/v1alpha1
 kind: TriggerAuthentication
--- a/HelmChart/Public/oneuptime/templates/keda-scaledobjects.yaml
+++ b/HelmChart/Public/oneuptime/templates/keda-scaledobjects.yaml
@@ -19,7 +19,7 @@ KEDA ScaledObjects for various services

 {{/* App KEDA ScaledObject - scales based on worker and telemetry queue sizes */}}
 {{- if and .Values.keda.enabled .Values.app.enabled .Values.app.keda.enabled (not .Values.app.disableAutoscaler) (not .Values.deployment.disableDeployments) }}
-{{- $metricsConfig := dict "enabled" .Values.app.keda.enabled "minReplicas" .Values.app.keda.minReplicas "maxReplicas" .Values.app.keda.maxReplicas "pollingInterval" .Values.app.keda.pollingInterval "cooldownPeriod" .Values.app.keda.cooldownPeriod "triggers" (list (dict "query" "oneuptime_app_worker_queue_size" "threshold" .Values.app.keda.workerQueueSizeThreshold "port" .Values.app.ports.http "urlPath" "/worker/metrics/queue-size") (dict "query" "oneuptime_app_telemetry_queue_size" "threshold" .Values.app.keda.telemetryQueueSizeThreshold "port" .Values.app.ports.http "urlPath" "/telemetry/metrics/queue-size")) }}
+{{- $metricsConfig := dict "enabled" .Values.app.keda.enabled "minReplicas" .Values.app.keda.minReplicas "maxReplicas" .Values.app.keda.maxReplicas "pollingInterval" .Values.app.keda.pollingInterval "cooldownPeriod" .Values.app.keda.cooldownPeriod "targetCPUUtilizationPercentage" .Values.app.keda.targetCPUUtilizationPercentage "targetMemoryUtilizationPercentage" .Values.app.keda.targetMemoryUtilizationPercentage "triggers" (list (dict "query" "oneuptime_app_worker_queue_size" "threshold" .Values.app.keda.workerQueueSizeThreshold "port" .Values.app.ports.http "urlPath" "/worker/metrics/queue-size") (dict "query" "oneuptime_app_telemetry_queue_size" "threshold" .Values.app.keda.telemetryQueueSizeThreshold "port" .Values.app.ports.http "urlPath" "/telemetry/metrics/queue-size")) }}
 {{- $appKedaArgs := dict "ServiceName" "app" "Release" .Release "Values" .Values "MetricsConfig" $metricsConfig "DisableAutoscaler" .Values.app.disableAutoscaler }}
 {{- include "oneuptime.kedaScaledObject" $appKedaArgs }}
 {{- end }}
--- a/HelmChart/Public/oneuptime/values.schema.json
+++ b/HelmChart/Public/oneuptime/values.schema.json
@@ -1813,6 +1813,12 @@
                        "telemetryQueueSizeThreshold": {
                            "type": "integer"
                        },
+                        "targetCPUUtilizationPercentage": {
+                            "type": "integer"
+                        },
+                        "targetMemoryUtilizationPercentage": {
+                            "type": "integer"
+                        },
                        "pollingInterval": {
                            "type": "integer"
                        },
--- a/HelmChart/Public/oneuptime/values.yaml
+++ b/HelmChart/Public/oneuptime/values.yaml
@@ -671,6 +671,10 @@ app:
    workerQueueSizeThreshold: 10
    # Scale up when telemetry queue size exceeds this threshold
    telemetryQueueSizeThreshold: 10
+    # Scale up when average CPU utilization (relative to the pods' CPU requests) exceeds this percentage; set to 0 to disable
+    targetCPUUtilizationPercentage: 80
+    # Scale up when average memory utilization (relative to the pods' memory requests) exceeds this percentage; set to 0 to disable
+    targetMemoryUtilizationPercentage: 80
    # Polling interval for metrics (in seconds)
    pollingInterval: 30
    # Cooldown period after scaling (in seconds)