feat: add KEDA autoscaling configuration for worker and telemetry queue metrics

This commit is contained in:
Nawaz Dhandala
2026-04-02 14:20:57 +01:00
parent 6f7907102b
commit 407d4e3687
5 changed files with 52 additions and 3 deletions

View File

@@ -832,7 +832,7 @@ spec:
- type: metrics-api
metadata:
targetValue: {{ .threshold | quote }}
url: http://{{ printf "%s-%s" $.Release.Name $.ServiceName }}:{{ .port }}/metrics/queue-size
url: http://{{ printf "%s-%s" $.Release.Name $.ServiceName }}:{{ .port }}{{ if .urlPath }}{{ .urlPath }}{{ else }}/metrics/queue-size{{ end }}
valueLocation: 'queueSize'
method: 'GET'
# authenticationRef:

View File

@@ -17,6 +17,7 @@ spec:
selector:
matchLabels:
app: {{ printf "%s-%s" $.Release.Name "app" }}
{{- if not (and $.Values.keda.enabled $.Values.app.keda.enabled) }}
{{- if and $.Values.app (hasKey $.Values.app "replicaCount") (ne $.Values.app.replicaCount nil) }}
replicas: {{ $.Values.app.replicaCount }}
{{- else }}
@@ -24,6 +25,7 @@ spec:
replicas: {{ $.Values.deployment.replicaCount }}
{{- end }}
{{- end }}
{{- end }}
strategy: {{- toYaml $.Values.deployment.updateStrategy | nindent 4 }}
template:
metadata:
@@ -136,8 +138,8 @@ spec:
---
# OneUptime app autoscaler
{{- if not $.Values.app.disableAutoscaler }}
# OneUptime app autoscaler (skip HPA when KEDA is managing scaling)
{{- if and (not $.Values.app.disableAutoscaler) (not (and $.Values.keda.enabled $.Values.app.keda.enabled)) }}
{{- $appAutoScalerArgs := dict "ServiceName" "app" "Release" $.Release "Values" $.Values -}}
{{- include "oneuptime.autoscaler" $appAutoScalerArgs }}
{{- end }}

View File

@@ -17,6 +17,13 @@ KEDA ScaledObjects for various services
{{- end }}
{{- end }}
{{/*
App KEDA ScaledObject.
Emitted only when KEDA is enabled both chart-wide and for the app, the app is
enabled, and neither the app autoscaler nor deployments are disabled.
Two triggers are handed to the shared "oneuptime.kedaScaledObject" template:
the worker queue size and the telemetry queue size, each with its own URL path
on the app HTTP port.
*/}}
{{- if and .Values.keda.enabled .Values.app.enabled .Values.app.keda.enabled (not (or .Values.app.disableAutoscaler .Values.deployment.disableDeployments)) }}
{{- $appKeda := .Values.app.keda }}
{{- $appHttpPort := .Values.app.ports.http }}
{{- $workerQueueTrigger := dict "query" "oneuptime_app_worker_queue_size" "threshold" $appKeda.workerQueueSizeThreshold "port" $appHttpPort "urlPath" "/worker/metrics/queue-size" }}
{{- $telemetryQueueTrigger := dict "query" "oneuptime_app_telemetry_queue_size" "threshold" $appKeda.telemetryQueueSizeThreshold "port" $appHttpPort "urlPath" "/telemetry/metrics/queue-size" }}
{{- $appMetricsConfig := dict "enabled" $appKeda.enabled "minReplicas" $appKeda.minReplicas "maxReplicas" $appKeda.maxReplicas "pollingInterval" $appKeda.pollingInterval "cooldownPeriod" $appKeda.cooldownPeriod "triggers" (list $workerQueueTrigger $telemetryQueueTrigger) }}
{{- include "oneuptime.kedaScaledObject" (dict "ServiceName" "app" "Release" .Release "Values" .Values "MetricsConfig" $appMetricsConfig "DisableAutoscaler" .Values.app.disableAutoscaler) }}
{{- end }}
{{/* AI Agent KEDA ScaledObject */}}
{{- if and .Values.keda.enabled .Values.aiAgent.enabled .Values.aiAgent.keda.enabled (not .Values.aiAgent.disableAutoscaler) (not .Values.deployment.disableDeployments) }}
{{- $metricsConfig := dict "enabled" .Values.aiAgent.keda.enabled "minReplicas" .Values.aiAgent.keda.minReplicas "maxReplicas" .Values.aiAgent.keda.maxReplicas "pollingInterval" .Values.aiAgent.keda.pollingInterval "cooldownPeriod" .Values.aiAgent.keda.cooldownPeriod "triggers" (list (dict "query" "oneuptime_ai_agent_queue_size" "threshold" .Values.aiAgent.keda.queueSizeThreshold "port" .Values.aiAgent.ports.http)) }}

View File

@@ -1794,6 +1794,33 @@
},
"enableProfiling": {
"type": "boolean"
},
"keda": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"minReplicas": {
"type": "integer"
},
"maxReplicas": {
"type": "integer"
},
"workerQueueSizeThreshold": {
"type": "integer"
},
"telemetryQueueSizeThreshold": {
"type": "integer"
},
"pollingInterval": {
"type": "integer"
},
"cooldownPeriod": {
"type": "integer"
}
},
"additionalProperties": false
}
},
"additionalProperties": false

View File

@@ -662,6 +662,19 @@ app:
nodeSelector: {}
podSecurityContext: {}
containerSecurityContext: {}
# KEDA autoscaling configuration based on worker and telemetry queue metrics.
# Only takes effect when the chart-level keda.enabled flag is also true. When both
# flags are true, the app Deployment template omits its replica count and the
# app HPA is not rendered.
keda:
enabled: false
# Lower and upper replica bounds passed to the app KEDA ScaledObject template
minReplicas: 1
maxReplicas: 100
# Scale up when worker queue size exceeds this threshold.
# Passed as the threshold of the trigger that reads /worker/metrics/queue-size
# on the app HTTP port (rendered as the metrics-api trigger's targetValue).
workerQueueSizeThreshold: 10
# Scale up when telemetry queue size exceeds this threshold.
# Passed as the threshold of the trigger that reads /telemetry/metrics/queue-size
# on the app HTTP port (rendered as the metrics-api trigger's targetValue).
telemetryQueueSizeThreshold: 10
# Polling interval for metrics (in seconds)
pollingInterval: 30
# Cooldown period after scaling (in seconds)
# NOTE(review): presumably maps to the KEDA ScaledObject cooldownPeriod field; confirm in the shared template
cooldownPeriod: 300
# AI Agent Configuration
# Deploy this to run an AI Agent within your Kubernetes cluster