From f9c90d7143da2516c02d36067597121bf24a4e32 Mon Sep 17 00:00:00 2001 From: Nawaz Dhandala Date: Thu, 5 Mar 2026 09:36:11 +0000 Subject: [PATCH] refactor: remove OpenTelemetry Collector and integrate telemetry service - Deleted the otel-collector job from GitHub workflows and related deployment configurations. - Updated Helm charts to remove references to the OpenTelemetry Collector, including its deployment and service definitions. - Added gRPC server functionality directly in the telemetry service to handle telemetry ingestion. - Updated environment variables and Docker configurations to reflect the removal of the OpenTelemetry Collector. - Adjusted telemetry service to support both HTTP and gRPC protocols for telemetry data. --- .github/workflows/build.yml | 23 -- .github/workflows/release.yml | 76 +---- .github/workflows/test-release.yaml | 72 +---- .../Public/oneuptime/templates/_helpers.tpl | 2 - .../oneuptime/templates/otel-collector.yaml | 148 +--------- .../Public/oneuptime/templates/telemetry.yaml | 5 +- HelmChart/Public/oneuptime/values.schema.json | 60 +--- HelmChart/Public/oneuptime/values.yaml | 20 +- Nginx/default.conf.template | 18 +- Telemetry/GrpcServer.ts | 262 ++++++++++++++++++ Telemetry/Index.ts | 5 + .../ProtoFiles/OTel/v1/logs_service.proto | 41 +++ .../ProtoFiles/OTel/v1/metrics_service.proto | 41 +++ .../ProtoFiles/OTel/v1/trace_service.proto | 41 +++ Telemetry/package.json | 2 + config.example.env | 11 +- docker-compose.base.yml | 18 -- docker-compose.dev.yml | 9 - docker-compose.yml | 6 - 19 files changed, 412 insertions(+), 448 deletions(-) create mode 100644 Telemetry/GrpcServer.ts create mode 100644 Telemetry/ProtoFiles/OTel/v1/logs_service.proto create mode 100644 Telemetry/ProtoFiles/OTel/v1/metrics_service.proto create mode 100644 Telemetry/ProtoFiles/OTel/v1/trace_service.proto diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1725b5e713..f8fd6b52a9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -57,29 +57,6 @@ jobs: command: sudo docker build --no-cache -f ./Worker/Dockerfile . - docker-build-otel-collector: - runs-on: ubuntu-latest - env: - CI_PIPELINE_ID: ${{github.run_number}} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Preinstall - uses: nick-fields/retry@v3 - with: - timeout_minutes: 10 - max_attempts: 3 - command: npm run prerun - - # build image for accounts service - - name: build docker image - uses: nick-fields/retry@v3 - with: - timeout_minutes: 45 - max_attempts: 3 - command: sudo docker build --no-cache -f ./OTelCollector/Dockerfile . - docker-build-app: runs-on: ubuntu-latest env: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8c4a6b5677..cecc11b03c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -423,76 +423,6 @@ jobs: --platforms linux/amd64,linux/arm64 \ --git-sha "${{ github.sha }}" - otel-collector-docker-image-deploy: - needs: [generate-build-number, read-version] - runs-on: ubuntu-latest - env: - QEMU_CPU: max - steps: - - name: Free Disk Space (Ubuntu) - uses: jlumbroso/free-disk-space@main - with: - tool-cache: false - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - name: Docker Meta - id: meta - uses: docker/metadata-action@v4 - with: - images: | - oneuptime/otel-collector - ghcr.io/oneuptime/otel-collector - tags: | - type=raw,value=release,enable=true - type=semver,value=${{needs.read-version.outputs.major_minor}},pattern={{version}},enable=true - - - uses: actions/checkout@v4 - with: - ref: ${{ github.ref }} - - - uses: actions/setup-node@v4 - with: - node-version: latest - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - image: tonistiigi/binfmt:qemu-v10.0.4 - - - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Generate Dockerfile from Dockerfile.tpl - run: npm run prerun - - # Build and deploy otel-collector. - - - name: Login to Docker Hub - run: | - echo "${{ secrets.DOCKERHUB_PASSWORD }}" | docker login --username "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin - - - name: Login to GitHub Container Registry - run: | - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io --username "${{ github.repository_owner }}" --password-stdin - - - name: Build and push - run: | - bash ./Scripts/GHA/build_docker_images.sh \ - --image otel-collector \ - --version "${{needs.read-version.outputs.major_minor}}" \ - --dockerfile ./OTelCollector/Dockerfile \ - --context . \ - --platforms linux/amd64,linux/arm64 \ - --git-sha "${{ github.sha }}" - - - test-docker-image-deploy: needs: [generate-build-number, read-version] runs-on: ubuntu-latest @@ -1001,7 +931,6 @@ jobs: - e2e-docker-image-deploy - home-docker-image-deploy - test-server-docker-image-deploy - - otel-collector-docker-image-deploy - test-docker-image-deploy - telemetry-docker-image-deploy - probe-docker-image-deploy @@ -1020,7 +949,6 @@ jobs: "e2e", "home", "test-server", - "otel-collector", "test", "telemetry", "probe", @@ -1073,7 +1001,7 @@ jobs: test-e2e-release-saas: runs-on: ubuntu-latest - needs: [telemetry-docker-image-deploy, ai-agent-docker-image-deploy, app-docker-image-deploy, home-docker-image-deploy, worker-docker-image-deploy, otel-collector-docker-image-deploy, probe-docker-image-deploy, test-docker-image-deploy, test-server-docker-image-deploy, publish-npm-packages, e2e-docker-image-deploy, helm-chart-deploy, generate-build-number, read-version, nginx-docker-image-deploy] + needs: [telemetry-docker-image-deploy, ai-agent-docker-image-deploy, app-docker-image-deploy, home-docker-image-deploy, worker-docker-image-deploy, probe-docker-image-deploy, test-docker-image-deploy, test-server-docker-image-deploy, publish-npm-packages, e2e-docker-image-deploy, helm-chart-deploy, generate-build-number, read-version, nginx-docker-image-deploy] env: CI_PIPELINE_ID: ${{github.run_number}} steps: @@ -1204,7 +1132,7 @@ jobs: test-e2e-release-self-hosted: runs-on: ubuntu-latest # After all the jobs runs - needs: [telemetry-docker-image-deploy, ai-agent-docker-image-deploy, app-docker-image-deploy, home-docker-image-deploy, worker-docker-image-deploy, otel-collector-docker-image-deploy, probe-docker-image-deploy, test-docker-image-deploy, test-server-docker-image-deploy, publish-npm-packages, e2e-docker-image-deploy, helm-chart-deploy, generate-build-number, read-version, nginx-docker-image-deploy] + needs: [telemetry-docker-image-deploy, ai-agent-docker-image-deploy, app-docker-image-deploy, home-docker-image-deploy, worker-docker-image-deploy, probe-docker-image-deploy, test-docker-image-deploy, test-server-docker-image-deploy, publish-npm-packages, e2e-docker-image-deploy, helm-chart-deploy, generate-build-number, read-version, nginx-docker-image-deploy] env: CI_PIPELINE_ID: ${{github.run_number}} steps: diff --git a/.github/workflows/test-release.yaml b/.github/workflows/test-release.yaml index cd9e5a67eb..52da509ee2 100644 --- a/.github/workflows/test-release.yaml +++ b/.github/workflows/test-release.yaml @@ -297,76 +297,6 @@ jobs: --extra-tags test \ --extra-enterprise-tags enterprise-test - otel-collector-docker-image-deploy: - needs: [read-version, generate-build-number] - runs-on: ubuntu-latest - env: - QEMU_CPU: max - steps: - - name: Free Disk Space (Ubuntu) - uses: jlumbroso/free-disk-space@main - with: - tool-cache: false - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - name: Docker Meta - id: meta - uses: docker/metadata-action@v4 - with: - images: | - oneuptime/otel-collector - ghcr.io/oneuptime/otel-collector - tags: | - type=raw,value=test,enable=true - type=raw,value=${{needs.read-version.outputs.major_minor}}-test,enable=true - - - - uses: actions/checkout@v4 - with: - ref: ${{ github.ref }} - - - uses: actions/setup-node@v4 - with: - node-version: latest - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - image: tonistiigi/binfmt:qemu-v10.0.4 - - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Generate Dockerfile from Dockerfile.tpl - run: npm run prerun - - # Build and deploy otel-collector. - - - name: Login to Docker Hub - run: | - echo "${{ secrets.DOCKERHUB_PASSWORD }}" | docker login --username "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin - - - name: Login to GitHub Container Registry - run: | - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io --username "${{ github.repository_owner }}" --password-stdin - - - name: Build and push - run: | - bash ./Scripts/GHA/build_docker_images.sh \ - --image otel-collector \ - --version "${{needs.read-version.outputs.major_minor}}-test" \ - --dockerfile ./OTelCollector/Dockerfile \ - --context . \ - --platforms linux/amd64,linux/arm64 \ - --git-sha "${{ github.sha }}" \ - --extra-tags test \ - --extra-enterprise-tags enterprise-test - home-docker-image-deploy: needs: [read-version, generate-build-number] runs-on: ubuntu-latest @@ -879,7 +809,7 @@ jobs: test-helm-chart: runs-on: ubuntu-latest - needs: [infrastructure-agent-deploy, publish-terraform-provider, telemetry-docker-image-deploy, worker-docker-image-deploy, home-docker-image-deploy, test-server-docker-image-deploy, test-docker-image-deploy, probe-docker-image-deploy, app-docker-image-deploy, ai-agent-docker-image-deploy, otel-collector-docker-image-deploy, nginx-docker-image-deploy, e2e-docker-image-deploy] + needs: [infrastructure-agent-deploy, publish-terraform-provider, telemetry-docker-image-deploy, worker-docker-image-deploy, home-docker-image-deploy, test-server-docker-image-deploy, test-docker-image-deploy, probe-docker-image-deploy, app-docker-image-deploy, ai-agent-docker-image-deploy, nginx-docker-image-deploy, e2e-docker-image-deploy] env: CI_PIPELINE_ID: ${{github.run_number}} steps: diff --git a/HelmChart/Public/oneuptime/templates/_helpers.tpl b/HelmChart/Public/oneuptime/templates/_helpers.tpl index 16d6ad1938..c7520000c9 100644 --- a/HelmChart/Public/oneuptime/templates/_helpers.tpl +++ b/HelmChart/Public/oneuptime/templates/_helpers.tpl @@ -113,8 +113,6 @@ Usage: value: {{ $.Release.Name }}-telemetry.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }} - name: SERVER_TELEMETRY_HOSTNAME value: {{ $.Release.Name }}-telemetry.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }} -- name: SERVER_OTEL_COLLECTOR_HOSTNAME - value: {{ $.Release.Name }}-otel-collector.{{ $.Release.Namespace }}.svc.{{ $.Values.global.clusterDomain }} - name: APP_PORT value: {{ $.Values.app.ports.http | squote }} - name: TELEMETRY_PORT diff --git a/HelmChart/Public/oneuptime/templates/otel-collector.yaml b/HelmChart/Public/oneuptime/templates/otel-collector.yaml index f19fe395f2..665f3084fd 100644 --- a/HelmChart/Public/oneuptime/templates/otel-collector.yaml +++ b/HelmChart/Public/oneuptime/templates/otel-collector.yaml @@ -1,147 +1 @@ -{{- if $.Values.openTelemetryCollector.enabled }} -# OneUptime otel-collector Deployment - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ printf "%s-%s" $.Release.Name "otel-collector" }} - namespace: {{ $.Release.Namespace }} - labels: - app: {{ printf "%s-%s" $.Release.Name "otel-collector" }} - app.kubernetes.io/part-of: oneuptime - app.kubernetes.io/managed-by: Helm - appname: oneuptime - {{- if $.Values.deployment.includeTimestampLabel }} - date: "{{ now | unixEpoch }}" - {{- end }} -spec: - selector: - matchLabels: - app: {{ printf "%s-%s" $.Release.Name "otel-collector" }} - {{- if $.Values.openTelemetryCollector.replicaCount }} - replicas: {{ $.Values.openTelemetryCollector.replicaCount }} - {{- else }} - {{- if or (not $.Values.autoscaling.enabled) ($.Values.openTelemetryCollector.disableAutoscaler) }} - replicas: {{ $.Values.deployment.replicaCount }} - {{- end }} - {{- end }} - strategy: {{- toYaml $.Values.deployment.updateStrategy | nindent 4 }} - template: - metadata: - labels: - app: {{ printf "%s-%s" $.Release.Name "otel-collector" }} - {{- if $.Values.deployment.includeTimestampLabel }} - date: "{{ now | unixEpoch }}" - {{- end }} - appname: oneuptime - spec: - volumes: - - name: greenlockrc - emptyDir: - sizeLimit: "1Gi" - {{- if $.Values.openTelemetryCollector.podSecurityContext }} - securityContext: - {{- toYaml $.Values.openTelemetryCollector.podSecurityContext | nindent 8 }} - {{- else if $.Values.podSecurityContext }} - securityContext: - {{- toYaml $.Values.podSecurityContext | nindent 8 }} - {{- end }} - {{- if $.Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml $.Values.imagePullSecrets | nindent 8 }} - {{- end }} - {{- if $.Values.affinity }} - affinity: {{- $.Values.affinity | toYaml | nindent 8 }} - {{- end }} - {{- if $.Values.tolerations }} - tolerations: {{- $.Values.tolerations | toYaml | nindent 8 }} - {{- end }} - {{- if $.Values.openTelemetryCollector.nodeSelector }} - nodeSelector: - {{- toYaml $.Values.openTelemetryCollector.nodeSelector | nindent 8 }} - {{- else if $.Values.nodeSelector }} - nodeSelector: - {{- toYaml $.Values.nodeSelector | nindent 8 }} - {{- end }} - containers: - - image: {{ include "oneuptime.image" (dict "Values" $.Values "ServiceName" "otel-collector") }} - name: {{ printf "%s-%s" $.Release.Name "otel-collector" }} - # Liveness probe - {{- if $.Values.startupProbe.enabled }} - # Startup probe - startupProbe: - httpGet: - path: /health/status - port: 13133 - periodSeconds: {{ $.Values.startupProbe.periodSeconds }} - failureThreshold: {{ $.Values.startupProbe.failureThreshold }} - {{- end }} - {{- if $.Values.livenessProbe.enabled }} - # Liveness probe - livenessProbe: - httpGet: - path: /health/status - port: 13133 - periodSeconds: {{ $.Values.livenessProbe.periodSeconds }} - timeoutSeconds: {{ $.Values.livenessProbe.timeoutSeconds }} - initialDelaySeconds: {{ $.Values.livenessProbe.initialDelaySeconds }} - {{- end }} - {{- if $.Values.readinessProbe.enabled }} - # Readyness Probe - readinessProbe: - httpGet: - path: /health/status - port: 13133 - periodSeconds: {{ $.Values.readinessProbe.periodSeconds }} - initialDelaySeconds: {{ $.Values.readinessProbe.initialDelaySeconds }} - timeoutSeconds: {{ $.Values.readinessProbe.timeoutSeconds }} - {{- end }} - {{- if $.Values.openTelemetryCollector.containerSecurityContext }} - securityContext: - {{- toYaml $.Values.openTelemetryCollector.containerSecurityContext | nindent 12 }} - {{- else if $.Values.containerSecurityContext }} - securityContext: - {{- toYaml $.Values.containerSecurityContext | nindent 12 }} - {{- end }} - imagePullPolicy: {{ $.Values.image.pullPolicy }} - env: - {{- include "oneuptime.env.common" . | nindent 12 }} - {{- include "oneuptime.env.runtime" (dict "Values" $.Values "Release" $.Release) | nindent 12 }} - - name: PORT - value: {{ $.Values.openTelemetryCollector.ports.grpc | quote }} - - name: OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_ENABLED - value: {{ $.Values.openTelemetryCollector.sendingQueue.enabled | quote }} - - name: OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_NUM_CONSUMERS - value: {{ $.Values.openTelemetryCollector.sendingQueue.numConsumers | quote }} - - name: OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_SIZE - value: {{ $.Values.openTelemetryCollector.sendingQueue.size | quote }} - - name: DISABLE_TELEMETRY - value: {{ $.Values.openTelemetryCollector.disableTelemetryCollection | quote }} - ports: - - containerPort: {{ $.Values.openTelemetryCollector.ports.http }} - protocol: TCP - name: http - - containerPort: {{ $.Values.openTelemetryCollector.ports.grpc }} - protocol: TCP - name: grpc - {{- if $.Values.openTelemetryCollector.resources }} - resources: - {{- toYaml $.Values.openTelemetryCollector.resources | nindent 12 }} - {{- end }} - restartPolicy: {{ $.Values.image.restartPolicy }} ---- - -# OneUptime otel-collector autoscaler -{{- if not $.Values.openTelemetryCollector.disableAutoscaler }} -{{- $identityAutoScalerArgs := dict "ServiceName" "otel-collector" "Release" $.Release "Values" $.Values -}} -{{- include "oneuptime.autoscaler" $identityAutoScalerArgs }} -{{- end }} ---- - -{{- end }} - -# OneUptime otel-collector Service -{{- $otelCollectorPorts := dict "grpc" $.Values.openTelemetryCollector.ports.grpc "http" $.Values.openTelemetryCollector.ports.http -}} -{{- $identityServiceArgs := dict "ServiceName" "otel-collector" "Ports" $otelCollectorPorts "Release" $.Release "Values" $.Values -}} -{{- include "oneuptime.service" $identityServiceArgs }} ---- +{{- /* OTel Collector has been removed. Telemetry ingestion (gRPC + HTTP) is now handled directly by the telemetry service. */ -}} diff --git a/HelmChart/Public/oneuptime/templates/telemetry.yaml b/HelmChart/Public/oneuptime/templates/telemetry.yaml index 6407333788..fb26ef9542 100644 --- a/HelmChart/Public/oneuptime/templates/telemetry.yaml +++ b/HelmChart/Public/oneuptime/templates/telemetry.yaml @@ -117,6 +117,9 @@ spec: - containerPort: {{ $.Values.telemetry.ports.http }} protocol: TCP name: http + - containerPort: {{ $.Values.telemetry.ports.grpc }} + protocol: TCP + name: grpc {{- if $.Values.telemetry.resources }} resources: {{- toYaml $.Values.telemetry.resources | nindent 12 }} @@ -135,7 +138,7 @@ spec: --- # OneUptime telemetry Service -{{- $telemetryPorts := dict "port" $.Values.telemetry.ports.http -}} +{{- $telemetryPorts := dict "http" $.Values.telemetry.ports.http "grpc" $.Values.telemetry.ports.grpc -}} {{- $telemetryServiceArgs := dict "ServiceName" "telemetry" "Ports" $telemetryPorts "Release" $.Release "Values" $.Values -}} {{- include "oneuptime.service" $telemetryServiceArgs }} --- diff --git a/HelmChart/Public/oneuptime/values.schema.json b/HelmChart/Public/oneuptime/values.schema.json index e4ba8cc9e9..61e71aac9f 100644 --- a/HelmChart/Public/oneuptime/values.schema.json +++ b/HelmChart/Public/oneuptime/values.schema.json @@ -1262,63 +1262,6 @@ }, "additionalProperties": false }, - "openTelemetryCollector": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - }, - "replicaCount": { - "type": "integer" - }, - "disableTelemetryCollection": { - "type": "boolean" - }, - "disableAutoscaler": { - "type": "boolean" - }, - "ports": { - "type": "object", - "properties": { - "grpc": { - "type": "integer" - }, - "http": { - "type": "integer" - } - }, - "additionalProperties": false - }, - "sendingQueue": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - }, - "size": { - "type": "integer" - }, - "numConsumers": { - "type": "integer" - } - }, - "additionalProperties": false - }, - "resources": { - "type": ["object", "null"] - }, - "nodeSelector": { - "type": "object" - }, - "podSecurityContext": { - "type": "object" - }, - "containerSecurityContext": { - "type": "object" - } - }, - "additionalProperties": false - }, "home": { "type": "object", "properties": { @@ -1528,6 +1471,9 @@ "properties": { "http": { "type": "integer" + }, + "grpc": { + "type": "integer" } }, "additionalProperties": false diff --git a/HelmChart/Public/oneuptime/values.yaml b/HelmChart/Public/oneuptime/values.yaml index 6a89d7df1d..ed0aec1e69 100644 --- a/HelmChart/Public/oneuptime/values.yaml +++ b/HelmChart/Public/oneuptime/values.yaml @@ -558,25 +558,6 @@ readinessProbe: # Readiness probe configuration initialDelaySeconds: 10 timeoutSeconds: 120 -# OpenTelemetry Collector Configuration -openTelemetryCollector: - enabled: true - replicaCount: 1 - disableTelemetryCollection: false - disableAutoscaler: false - ports: - grpc: 4317 - http: 4318 - sendingQueue: - enabled: true - size: 1000 - numConsumers: 3 - resources: {} - nodeSelector: {} - podSecurityContext: {} - containerSecurityContext: {} - - home: enabled: true replicaCount: 1 @@ -636,6 +617,7 @@ telemetry: concurrency: 100 ports: http: 3403 + grpc: 4317 resources: nodeSelector: {} podSecurityContext: {} diff --git a/Nginx/default.conf.template b/Nginx/default.conf.template index 8f3ebf2964..3dcba40486 100644 --- a/Nginx/default.conf.template +++ b/Nginx/default.conf.template @@ -19,12 +19,8 @@ upstream home { server ${SERVER_HOME_HOSTNAME}:${HOME_PORT} weight=10 max_fails=3 fail_timeout=30s; } -upstream opentelemetry-collector-http { - server ${SERVER_OTEL_COLLECTOR_HOSTNAME}:4318; -} - -upstream opentelemetry-collector-grpc { - server ${SERVER_OTEL_COLLECTOR_HOSTNAME}:4317; +upstream opentelemetry-grpc { + server ${SERVER_TELEMETRY_HOSTNAME}:4317; } # Status Pages @@ -472,12 +468,12 @@ ${PROVISION_SSL_CERTIFICATE_KEY_DIRECTIVE} location /otlp/ { - # This is for nginx not to crash when service is not available. + # This is for nginx not to crash when service is not available. resolver 127.0.0.1 valid=30s; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Proto $scheme; # enable WebSockets (for ws://sockjs not connected error in the accounts source: https://stackoverflow.com/questions/41381444/websocket-connection-failed-error-during-websocket-handshake-unexpected-respon) proxy_http_version 1.1; @@ -485,14 +481,14 @@ ${PROVISION_SSL_CERTIFICATE_KEY_DIRECTIVE} proxy_set_header Connection "upgrade"; - proxy_pass http://opentelemetry-collector-http/; + proxy_pass http://telemetry/otlp/; } location ~ /opentelemetry.proto.collector* { - # This is for nginx not to crash when service is not available. + # This is for nginx not to crash when service is not available. resolver 127.0.0.1 valid=30s; - grpc_pass grpc://opentelemetry-collector-grpc; + grpc_pass grpc://opentelemetry-grpc; } location /notification { diff --git a/Telemetry/GrpcServer.ts b/Telemetry/GrpcServer.ts new file mode 100644 index 0000000000..cb7002ad9f --- /dev/null +++ b/Telemetry/GrpcServer.ts @@ -0,0 +1,262 @@ +import * as grpc from "@grpc/grpc-js"; +import * as protoLoader from "@grpc/proto-loader"; +import path from "path"; +import logger from "Common/Server/Utils/Logger"; +import ObjectID from "Common/Types/ObjectID"; +import ProductType from "Common/Types/MeteredPlan/ProductType"; +import TelemetryIngestionKeyService from "Common/Server/Services/TelemetryIngestionKeyService"; +import TelemetryIngestionKey from "Common/Models/DatabaseModels/TelemetryIngestionKey"; +import { TelemetryRequest } from "Common/Server/Middleware/TelemetryIngest"; +import TracesQueueService from "./Services/Queue/TracesQueueService"; +import LogsQueueService from "./Services/Queue/LogsQueueService"; +import MetricsQueueService from "./Services/Queue/MetricsQueueService"; + +const GRPC_PORT: number = 4317; + +const PROTO_DIR: string = path.resolve( + __dirname, + "ProtoFiles", + "OTel", + "v1", +); + +type GrpcCallback = ( + error: grpc.ServiceError | null, + response?: Record, +) => void; + +interface GrpcCall { + request: Record; + metadata: grpc.Metadata; +} + +async function authenticateRequest( + metadata: grpc.Metadata, +): Promise { + const tokenValues: grpc.MetadataValue[] = + metadata.get("x-oneuptime-token"); + + let oneuptimeToken: string | undefined = tokenValues[0]?.toString(); + + if (!oneuptimeToken) { + const serviceTokenValues: grpc.MetadataValue[] = metadata.get( + "x-oneuptime-service-token", + ); + oneuptimeToken = serviceTokenValues[0]?.toString(); + } + + if (!oneuptimeToken) { + logger.error("gRPC: Missing metadata: x-oneuptime-token"); + return null; + } + + const token: TelemetryIngestionKey | null = + await TelemetryIngestionKeyService.findOneBy({ + query: { + secretKey: new ObjectID(oneuptimeToken), + }, + select: { + projectId: true, + }, + props: { + isRoot: true, + }, + }); + + if (!token || !token.projectId) { + logger.error("gRPC: Invalid service token: " + oneuptimeToken); + return null; + } + + return token.projectId as ObjectID; +} + +function buildTelemetryRequest( + body: Record, + metadata: grpc.Metadata, + projectId: ObjectID, + productType: ProductType, +): TelemetryRequest { + const headers: Record = {}; + + for (const key of metadata.keys()) { + const values: grpc.MetadataValue[] = metadata.get(key); + if (values.length > 0) { + headers[key] = values[0]!.toString(); + } + } + + const req: Partial = { + body: body, + headers: headers, + projectId: projectId, + productType: productType, + path: `/otlp/v1/${productType}`, + url: `/otlp/v1/${productType}`, + }; + + return req as TelemetryRequest; +} + +async function handleExport( + call: GrpcCall, + callback: GrpcCallback, + productType: ProductType, + queueFn: (req: TelemetryRequest) => Promise, +): Promise { + try { + const projectId: ObjectID | null = await authenticateRequest( + call.metadata, + ); + + if (!projectId) { + // Return success to avoid OTel SDK retries + callback(null, {}); + return; + } + + const body: Record = call.request; + + const req: TelemetryRequest = buildTelemetryRequest( + body, + call.metadata, + projectId, + productType, + ); + + await queueFn(req); + + callback(null, {}); + } catch (err) { + logger.error(`gRPC ${productType} export error:`); + logger.error(err); + // Return success to avoid OTel SDK retries + callback(null, {}); + } +} + +export function startGrpcServer(): void { + const traceServiceDef: protoLoader.PackageDefinition = protoLoader.loadSync( + path.join(PROTO_DIR, "trace_service.proto"), + { + keepCase: false, + longs: String, + enums: String, + defaults: true, + oneofs: true, + includeDirs: [PROTO_DIR], + }, + ); + + const logsServiceDef: protoLoader.PackageDefinition = protoLoader.loadSync( + path.join(PROTO_DIR, "logs_service.proto"), + { + keepCase: false, + longs: String, + enums: String, + defaults: true, + oneofs: true, + includeDirs: [PROTO_DIR], + }, + ); + + const metricsServiceDef: protoLoader.PackageDefinition = + protoLoader.loadSync( + path.join(PROTO_DIR, "metrics_service.proto"), + { + keepCase: false, + longs: String, + enums: String, + defaults: true, + oneofs: true, + includeDirs: [PROTO_DIR], + }, + ); + + const traceProto: grpc.GrpcObject = + grpc.loadPackageDefinition(traceServiceDef); + const logsProto: grpc.GrpcObject = + grpc.loadPackageDefinition(logsServiceDef); + const metricsProto: grpc.GrpcObject = + grpc.loadPackageDefinition(metricsServiceDef); + + type ProtoServiceDef = { + service: grpc.ServiceDefinition; + }; + + function getServiceDefinition( + proto: grpc.GrpcObject, + ...path: Array + ): grpc.ServiceDefinition { + let current: unknown = proto; + for (const key of path) { + current = (current as Record)[key]; + } + return (current as ProtoServiceDef).service; + } + + const traceServiceDefinition: grpc.ServiceDefinition = getServiceDefinition( + traceProto, + "opentelemetry", "proto", "collector", "trace", "v1", "TraceService", + ); + + const logsServiceDefinition: grpc.ServiceDefinition = getServiceDefinition( + logsProto, + "opentelemetry", "proto", "collector", "logs", "v1", "LogsService", + ); + + const metricsServiceDefinition: grpc.ServiceDefinition = getServiceDefinition( + metricsProto, + "opentelemetry", "proto", "collector", "metrics", "v1", "MetricsService", + ); + + const server: grpc.Server = new grpc.Server({ + "grpc.max_receive_message_length": 50 * 1024 * 1024, // 50MB + }); + + server.addService(traceServiceDefinition, { + Export: (call: GrpcCall, callback: GrpcCallback): void => { + handleExport( + call, + callback, + ProductType.Traces, + TracesQueueService.addTraceIngestJob.bind(TracesQueueService), + ); + }, + }); + + server.addService(logsServiceDefinition, { + Export: (call: GrpcCall, callback: GrpcCallback): void => { + handleExport( + call, + callback, + ProductType.Logs, + LogsQueueService.addLogIngestJob.bind(LogsQueueService), + ); + }, + }); + + server.addService(metricsServiceDefinition, { + Export: (call: GrpcCall, callback: GrpcCallback): void => { + handleExport( + call, + callback, + ProductType.Metrics, + MetricsQueueService.addMetricIngestJob.bind(MetricsQueueService), + ); + }, + }); + + server.bindAsync( + `0.0.0.0:${GRPC_PORT}`, + grpc.ServerCredentials.createInsecure(), + (err: Error | null, port: number): void => { + if (err) { + logger.error("Failed to start gRPC server:"); + logger.error(err); + return; + } + logger.info(`gRPC OTLP server started on port: ${port}`); + }, + ); +} diff --git a/Telemetry/Index.ts b/Telemetry/Index.ts index 5bc0eaba66..a0723a8d12 100644 --- a/Telemetry/Index.ts +++ b/Telemetry/Index.ts @@ -24,6 +24,7 @@ import Telemetry from "Common/Server/Utils/Telemetry"; import "./Jobs/TelemetryIngest/ProcessTelemetry"; import { TELEMETRY_CONCURRENCY } from "./Config"; import type { StatusAPIOptions } from "Common/Server/API/StatusAPI"; +import { startGrpcServer } from "./GrpcServer"; import "ejs"; const app: ExpressApplication = Express.getExpressApp(); @@ -104,6 +105,10 @@ const init: PromiseVoidFunction = async (): Promise => { ); await Realtime.init(); + + // Start gRPC OTLP server on port 4317 + startGrpcServer(); + // add default routes await App.addDefaultRoutes(); } catch (err) { diff --git a/Telemetry/ProtoFiles/OTel/v1/logs_service.proto b/Telemetry/ProtoFiles/OTel/v1/logs_service.proto new file mode 100644 index 0000000000..2d0cc687ba --- /dev/null +++ b/Telemetry/ProtoFiles/OTel/v1/logs_service.proto @@ -0,0 +1,41 @@ +// Copyright 2020, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.logs.v1; + +import "./logs.proto"; + +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.logs.v1"; +option java_outer_classname = "LogsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/logs/v1"; + +service LogsService { + rpc Export(ExportLogsServiceRequest) returns (ExportLogsServiceResponse) {} +} + +message ExportLogsServiceRequest { + repeated opentelemetry.proto.logs.v1.ResourceLogs resource_logs = 1; +} + +message ExportLogsServiceResponse { + ExportLogsPartialSuccess partial_success = 1; +} + +message ExportLogsPartialSuccess { + int64 rejected_log_records = 1; + string error_message = 2; +} diff --git a/Telemetry/ProtoFiles/OTel/v1/metrics_service.proto b/Telemetry/ProtoFiles/OTel/v1/metrics_service.proto new file mode 100644 index 0000000000..0ef61a3d34 --- /dev/null +++ b/Telemetry/ProtoFiles/OTel/v1/metrics_service.proto @@ -0,0 +1,41 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.metrics.v1; + +import "./metrics.proto"; + +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.metrics.v1"; +option java_outer_classname = "MetricsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/metrics/v1"; + +service MetricsService { + rpc Export(ExportMetricsServiceRequest) returns (ExportMetricsServiceResponse) {} +} + +message ExportMetricsServiceRequest { + repeated opentelemetry.proto.metrics.v1.ResourceMetrics resource_metrics = 1; +} + +message ExportMetricsServiceResponse { + ExportMetricsPartialSuccess partial_success = 1; +} + +message ExportMetricsPartialSuccess { + int64 rejected_data_points = 1; + string error_message = 2; +} diff --git a/Telemetry/ProtoFiles/OTel/v1/trace_service.proto b/Telemetry/ProtoFiles/OTel/v1/trace_service.proto new file mode 100644 index 0000000000..753d985629 --- /dev/null +++ b/Telemetry/ProtoFiles/OTel/v1/trace_service.proto @@ -0,0 +1,41 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.trace.v1; + +import "./traces.proto"; + +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.trace.v1"; +option java_outer_classname = "TraceServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/trace/v1"; + +service TraceService { + rpc Export(ExportTraceServiceRequest) returns (ExportTraceServiceResponse) {} +} + +message ExportTraceServiceRequest { + repeated opentelemetry.proto.trace.v1.ResourceSpans resource_spans = 1; +} + +message ExportTraceServiceResponse { + ExportTracePartialSuccess partial_success = 1; +} + +message ExportTracePartialSuccess { + int64 rejected_spans = 1; + string error_message = 2; +} diff --git a/Telemetry/package.json b/Telemetry/package.json index 03269e9100..336c9dad2a 100644 --- a/Telemetry/package.json +++ b/Telemetry/package.json @@ -19,6 +19,8 @@ "author": "OneUptime (https://oneuptime.com/)", "license": "Apache-2.0", "dependencies": { + "@grpc/grpc-js": "^1.12.5", + "@grpc/proto-loader": "^0.7.13", "Common": "file:../Common", "ejs": "^3.1.10", "protobufjs": "^7.3.2", diff --git a/config.example.env b/config.example.env index 263e882d81..7f6990c4a2 100644 --- a/config.example.env +++ b/config.example.env @@ -96,14 +96,12 @@ TELEMETRY_HOSTNAME=telemetry:3403 SERVER_APP_HOSTNAME=app SERVER_TELEMETRY_HOSTNAME=telemetry -SERVER_OTEL_COLLECTOR_HOSTNAME=otel-collector SERVER_WORKER_HOSTNAME=worker #Ports. Usually they don't need to change. APP_PORT=3002 TELEMETRY_PORT=3403 TEST_SERVER_PORT=3800 -OTEL_COLLECTOR_HTTP_PORT=4318 HOME_PORT=1444 WORKER_PORT=1445 # Plans @@ -193,7 +191,7 @@ DISABLE_AUTOMATIC_ALERT_CREATION=false # If you're using an extrenal open telemetry collector, you can set the endpoint here - both server and client endpoint can be the same in this case. -# You can set the env var to http://otel-collector:4318 if you want instrumentation to be sent to otel collector. +# You can set the env var to an OTLP endpoint if you want instrumentation to be exported. OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT= # You can set the env var to "x-oneuptime-token=" @@ -295,19 +293,12 @@ DISABLE_TELEMETRY_FOR_STATUS_PAGE=true DISABLE_TELEMETRY_FOR_DASHBOARD=true DISABLE_TELEMETRY_FOR_PROBE=true DISABLE_TELEMETRY_FOR_ADMIN_DASHBOARD=true -DISABLE_TELEMETRY_FOR_OTEL_COLLECTOR=true DISABLE_TELEMETRY_FOR_INGRESS=true DISABLE_TELEMETRY_FOR_WORKER=true DISABLE_TELEMETRY_FOR_AI_AGENT=true -# OPENTELEMETRY_COLLECTOR env vars -OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_ENABLED=true -OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_SIZE=1000 -OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_NUM_CONSUMERS=3 - - # Connect OneUptime with Slack App SLACK_APP_CLIENT_ID= SLACK_APP_CLIENT_SECRET= diff --git a/docker-compose.base.yml b/docker-compose.base.yml index 66996b5136..192cb84808 100644 --- a/docker-compose.base.yml +++ b/docker-compose.base.yml @@ -30,7 +30,6 @@ x-common-variables: &common-variables SERVER_APP_HOSTNAME: app SERVER_TELEMETRY_HOSTNAME: telemetry - SERVER_OTEL_COLLECTOR_HOSTNAME: otel-collector SERVER_WORKER_HOSTNAME: worker SERVER_HOME_HOSTNAME: home #Ports. Usually they don't need to change. @@ -335,23 +334,6 @@ services: options: max-size: "1000m" - otel-collector: - networks: - - oneuptime - environment: - <<: *common-variables - DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_OTEL_COLLECTOR} - OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_ENABLED: ${OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_ENABLED} - OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_SIZE: ${OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_SIZE} - OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_NUM_CONSUMERS: ${OPENTELEMETRY_COLLECTOR_SENDING_QUEUE_NUM_CONSUMERS} - restart: always - logging: - driver: "local" - options: - max-size: "1000m" - ports: - - 13133:13133 # Otel Collector Health Check Endpoint at /heath/status - fluentd: networks: - oneuptime diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index f2d152d7aa..94007ed7aa 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -36,15 +36,6 @@ services: service: postgres - otel-collector: - extends: - file: ./docker-compose.base.yml - service: otel-collector - build: - network: host - context: . - dockerfile: ./OTelCollector/Dockerfile - test-server: volumes: - ./TestServer:/usr/src/app:cached diff --git a/docker-compose.yml b/docker-compose.yml index 876d817f54..cdafeabfe4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -29,12 +29,6 @@ services: service: postgres - otel-collector: - image: oneuptime/otel-collector:${APP_TAG} - extends: - file: ./docker-compose.base.yml - service: otel-collector - app: image: oneuptime/app:${APP_TAG} extends: