mirror of
https://github.com/OneUptime/oneuptime.git
synced 2026-04-06 00:32:12 +02:00
feat: Implement OpenTelemetry Profiles roadmap documentation
This commit is contained in:
@@ -34,12 +34,6 @@ import MoreMenuItem from "Common/UI/Components/MoreMenu/MoreMenuItem";
|
||||
import IconProp from "Common/Types/Icon/IconProp";
|
||||
import Button, { ButtonStyleType } from "Common/UI/Components/Button/Button";
|
||||
import DashboardVariableSelector from "./DashboardVariableSelector";
|
||||
import NavBar from "Common/UI/Components/Navbar/NavBar";
|
||||
import NavBarItem from "Common/UI/Components/Navbar/NavBarItem";
|
||||
import PageMap from "../../Utils/PageMap";
|
||||
import RouteMap, { RouteUtil } from "../../Utils/RouteMap";
|
||||
import PublicDashboardUtil from "../../Utils/PublicDashboard";
|
||||
import Route from "Common/Types/API/Route";
|
||||
|
||||
export interface ComponentProps {
|
||||
dashboardId: ObjectID;
|
||||
@@ -216,14 +210,6 @@ const DashboardViewPage: FunctionComponent<ComponentProps> = (
|
||||
return <PageLoader isVisible={true} />;
|
||||
}
|
||||
|
||||
const isPreview: boolean = PublicDashboardUtil.isPreviewPage();
|
||||
|
||||
const overviewRoute: Route = RouteUtil.populateRouteParams(
|
||||
isPreview
|
||||
? (RouteMap[PageMap.PREVIEW_OVERVIEW] as Route)
|
||||
: (RouteMap[PageMap.OVERVIEW] as Route),
|
||||
);
|
||||
|
||||
return (
|
||||
<div
|
||||
ref={dashboardViewRef}
|
||||
@@ -233,174 +219,149 @@ const DashboardViewPage: FunctionComponent<ComponentProps> = (
|
||||
background: "#fafbfc",
|
||||
}}
|
||||
>
|
||||
{/* Header and NavBar */}
|
||||
<div className="max-w-5xl mx-auto px-3 sm:px-5">
|
||||
<div className="flex items-center gap-4 mt-5">
|
||||
{logoUrl && (
|
||||
<img
|
||||
src={logoUrl}
|
||||
alt={dashboardName}
|
||||
className="h-10 w-auto object-contain"
|
||||
/>
|
||||
)}
|
||||
<div>
|
||||
<h1 className="text-xl font-semibold text-gray-900 truncate">
|
||||
{dashboardName}
|
||||
</h1>
|
||||
{pageDescription && (
|
||||
<p className="text-sm text-gray-500 mt-0.5 truncate">
|
||||
{pageDescription}
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<NavBar className="bg-white flex text-center justify-between py-2 mt-5 rounded-lg shadow px-5">
|
||||
<NavBarItem
|
||||
id="overview-nav-bar-item"
|
||||
title="Overview"
|
||||
icon={IconProp.CheckCircle}
|
||||
exact={true}
|
||||
route={overviewRoute}
|
||||
/>
|
||||
</NavBar>
|
||||
</div>
|
||||
|
||||
{/* Public Dashboard Toolbar */}
|
||||
{/* Header */}
|
||||
<div
|
||||
className="mx-3 mt-3 mb-2 rounded-lg bg-white border border-gray-200"
|
||||
className="bg-white border-b border-gray-200"
|
||||
style={{
|
||||
boxShadow:
|
||||
"0 1px 3px 0 rgba(0, 0, 0, 0.05), 0 1px 2px -1px rgba(0, 0, 0, 0.04)",
|
||||
boxShadow: "0 1px 3px 0 rgba(0, 0, 0, 0.05)",
|
||||
}}
|
||||
>
|
||||
<div
|
||||
className="h-0.5 rounded-t-lg"
|
||||
style={{
|
||||
background: "linear-gradient(90deg, #6366f1 0%, #8b5cf6 100%)",
|
||||
}}
|
||||
></div>
|
||||
<div className="flex items-center justify-between px-5 py-3">
|
||||
<div className="flex items-center gap-3 min-w-0">
|
||||
{hasComponents && (
|
||||
<span className="text-xs text-gray-400 tabular-nums">
|
||||
{dashboardViewConfig.components.length} widget
|
||||
{dashboardViewConfig.components.length !== 1 ? "s" : ""}
|
||||
</span>
|
||||
)}
|
||||
{isRefreshing &&
|
||||
autoRefreshInterval !== AutoRefreshInterval.OFF && (
|
||||
<span className="inline-flex items-center gap-1.5 text-xs text-blue-600">
|
||||
<span className="w-1.5 h-1.5 bg-blue-500 rounded-full animate-pulse"></span>
|
||||
Refreshing
|
||||
</span>
|
||||
<div className="max-w-7xl mx-auto px-5 py-4">
|
||||
<div className="flex items-center justify-between">
|
||||
{/* Logo + Title + Description */}
|
||||
<div className="flex items-center gap-4 min-w-0">
|
||||
{logoUrl && (
|
||||
<img
|
||||
src={logoUrl}
|
||||
alt={dashboardName}
|
||||
className="h-8 w-auto object-contain flex-shrink-0"
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
|
||||
<div className="flex items-center gap-1.5">
|
||||
{/* Reset Zoom button */}
|
||||
{timeRangeStack.length > 0 && (
|
||||
<Button
|
||||
icon={IconProp.Refresh}
|
||||
title="Reset Zoom"
|
||||
buttonStyle={ButtonStyleType.HOVER_PRIMARY_OUTLINE}
|
||||
onClick={() => {
|
||||
const previousRange: RangeStartAndEndDateTime | undefined =
|
||||
timeRangeStack[0];
|
||||
if (previousRange) {
|
||||
setStartAndEndDate(previousRange);
|
||||
setTimeRangeStack([]);
|
||||
}
|
||||
}}
|
||||
tooltip="Reset to original time range"
|
||||
/>
|
||||
)}
|
||||
|
||||
{/* Auto-refresh dropdown */}
|
||||
{hasComponents && (
|
||||
<MoreMenu
|
||||
menuIcon={IconProp.Refresh}
|
||||
text={
|
||||
autoRefreshInterval !== AutoRefreshInterval.OFF
|
||||
? getAutoRefreshIntervalLabel(autoRefreshInterval)
|
||||
: ""
|
||||
}
|
||||
>
|
||||
{Object.values(AutoRefreshInterval).map(
|
||||
(interval: AutoRefreshInterval) => {
|
||||
return (
|
||||
<MoreMenuItem
|
||||
key={interval}
|
||||
text={getAutoRefreshIntervalLabel(interval)}
|
||||
onClick={() => {
|
||||
setAutoRefreshInterval(interval);
|
||||
}}
|
||||
/>
|
||||
);
|
||||
},
|
||||
<div className="min-w-0">
|
||||
<h1 className="text-lg font-semibold text-gray-900 truncate">
|
||||
{dashboardName}
|
||||
</h1>
|
||||
{pageDescription && (
|
||||
<p className="text-sm text-gray-500 truncate">
|
||||
{pageDescription}
|
||||
</p>
|
||||
)}
|
||||
</MoreMenu>
|
||||
)}
|
||||
|
||||
<Button
|
||||
icon={IconProp.Expand}
|
||||
buttonStyle={ButtonStyleType.ICON}
|
||||
onClick={() => {
|
||||
const canvasElement: HTMLDivElement | null =
|
||||
dashboardCanvasRef.current;
|
||||
|
||||
if (!canvasElement) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (canvasElement.requestFullscreen) {
|
||||
canvasElement.requestFullscreen();
|
||||
}
|
||||
}}
|
||||
tooltip="Full Screen"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Bottom row: Time range + variables */}
|
||||
{hasComponents && (
|
||||
<div className="flex items-center gap-3 px-5 pb-3 pt-0 flex-wrap">
|
||||
<div>
|
||||
<RangeStartAndEndDateView
|
||||
dashboardStartAndEndDate={startAndEndDate}
|
||||
onChange={(newRange: RangeStartAndEndDateTime) => {
|
||||
setTimeRangeStack([...timeRangeStack, startAndEndDate]);
|
||||
setStartAndEndDate(newRange);
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{dashboardVariables.length > 0 && (
|
||||
<>
|
||||
<div className="w-px h-5 bg-gray-200"></div>
|
||||
<DashboardVariableSelector
|
||||
variables={dashboardVariables}
|
||||
onVariableValueChange={(
|
||||
variableId: string,
|
||||
value: string,
|
||||
) => {
|
||||
setDashboardVariables(
|
||||
dashboardVariables.map((v: DashboardVariable) => {
|
||||
if (v.id === variableId) {
|
||||
return { ...v, currentValue: value };
|
||||
}
|
||||
return v;
|
||||
}),
|
||||
);
|
||||
{/* Controls */}
|
||||
<div className="flex items-center gap-2 flex-shrink-0">
|
||||
{isRefreshing &&
|
||||
autoRefreshInterval !== AutoRefreshInterval.OFF && (
|
||||
<span className="inline-flex items-center gap-1.5 text-xs text-blue-600">
|
||||
<span className="w-1.5 h-1.5 bg-blue-500 rounded-full animate-pulse"></span>
|
||||
Refreshing
|
||||
</span>
|
||||
)}
|
||||
|
||||
{timeRangeStack.length > 0 && (
|
||||
<Button
|
||||
icon={IconProp.Refresh}
|
||||
title="Reset Zoom"
|
||||
buttonStyle={ButtonStyleType.HOVER_PRIMARY_OUTLINE}
|
||||
onClick={() => {
|
||||
const previousRange: RangeStartAndEndDateTime | undefined =
|
||||
timeRangeStack[0];
|
||||
if (previousRange) {
|
||||
setStartAndEndDate(previousRange);
|
||||
setTimeRangeStack([]);
|
||||
}
|
||||
}}
|
||||
tooltip="Reset to original time range"
|
||||
/>
|
||||
)}
|
||||
|
||||
{hasComponents && (
|
||||
<MoreMenu
|
||||
menuIcon={IconProp.Refresh}
|
||||
text={
|
||||
autoRefreshInterval !== AutoRefreshInterval.OFF
|
||||
? getAutoRefreshIntervalLabel(autoRefreshInterval)
|
||||
: ""
|
||||
}
|
||||
>
|
||||
{Object.values(AutoRefreshInterval).map(
|
||||
(interval: AutoRefreshInterval) => {
|
||||
return (
|
||||
<MoreMenuItem
|
||||
key={interval}
|
||||
text={getAutoRefreshIntervalLabel(interval)}
|
||||
onClick={() => {
|
||||
setAutoRefreshInterval(interval);
|
||||
}}
|
||||
/>
|
||||
);
|
||||
},
|
||||
)}
|
||||
</MoreMenu>
|
||||
)}
|
||||
|
||||
<Button
|
||||
icon={IconProp.Expand}
|
||||
buttonStyle={ButtonStyleType.ICON}
|
||||
onClick={() => {
|
||||
const canvasElement: HTMLDivElement | null =
|
||||
dashboardCanvasRef.current;
|
||||
|
||||
if (!canvasElement) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (canvasElement.requestFullscreen) {
|
||||
canvasElement.requestFullscreen();
|
||||
}
|
||||
}}
|
||||
tooltip="Full Screen"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Time range + variables row */}
|
||||
{hasComponents && (
|
||||
<div className="flex items-center gap-3 mt-3 flex-wrap">
|
||||
<div>
|
||||
<RangeStartAndEndDateView
|
||||
dashboardStartAndEndDate={startAndEndDate}
|
||||
onChange={(newRange: RangeStartAndEndDateTime) => {
|
||||
setTimeRangeStack([...timeRangeStack, startAndEndDate]);
|
||||
setStartAndEndDate(newRange);
|
||||
}}
|
||||
/>
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{dashboardVariables.length > 0 && (
|
||||
<>
|
||||
<div className="w-px h-5 bg-gray-200"></div>
|
||||
<DashboardVariableSelector
|
||||
variables={dashboardVariables}
|
||||
onVariableValueChange={(
|
||||
variableId: string,
|
||||
value: string,
|
||||
) => {
|
||||
setDashboardVariables(
|
||||
dashboardVariables.map((v: DashboardVariable) => {
|
||||
if (v.id === variableId) {
|
||||
return { ...v, currentValue: value };
|
||||
}
|
||||
return v;
|
||||
}),
|
||||
);
|
||||
}}
|
||||
/>
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div ref={dashboardCanvasRef}>
|
||||
{/* Dashboard Canvas */}
|
||||
<div ref={dashboardCanvasRef} className="mt-3">
|
||||
<DashboardCanvas
|
||||
dashboardViewConfig={dashboardViewConfig}
|
||||
onDashboardViewConfigChange={(_config: DashboardViewConfig) => {
|
||||
@@ -423,7 +384,7 @@ const DashboardViewPage: FunctionComponent<ComponentProps> = (
|
||||
</div>
|
||||
|
||||
{/* Footer */}
|
||||
<div className="max-w-5xl mx-auto px-3 sm:px-5 py-5">
|
||||
<div className="max-w-7xl mx-auto px-5 py-5">
|
||||
<div className="flex items-center justify-center text-xs text-gray-400">
|
||||
<span>Powered by</span>
|
||||
<a
|
||||
|
||||
@@ -18,6 +18,7 @@ import HashedString from "../../Types/HashedString";
|
||||
import ObjectID from "../../Types/ObjectID";
|
||||
import Dashboard from "../../Models/DatabaseModels/Dashboard";
|
||||
import DashboardDomain from "../../Models/DatabaseModels/DashboardDomain";
|
||||
import File from "../../Models/DatabaseModels/File";
|
||||
import { EncryptionSecret } from "../EnvironmentConfig";
|
||||
import { DASHBOARD_MASTER_PASSWORD_INVALID_MESSAGE } from "../../Types/Dashboard/MasterPassword";
|
||||
import NotAuthenticatedException from "../../Types/Exception/NotAuthenticatedException";
|
||||
@@ -214,12 +215,16 @@ export default class DashboardAPI extends BaseAPI<
|
||||
pageDescription: dashboard.pageDescription || "",
|
||||
logoFile: dashboard.logoFile
|
||||
? JSONFunctions.serialize(
|
||||
dashboard.logoFile.toJSON() as any,
|
||||
dashboard.logoFile instanceof File
|
||||
? (dashboard.logoFile.toJSON() as any)
|
||||
: (dashboard.logoFile as any),
|
||||
)
|
||||
: null,
|
||||
faviconFile: dashboard.faviconFile
|
||||
? JSONFunctions.serialize(
|
||||
dashboard.faviconFile.toJSON() as any,
|
||||
dashboard.faviconFile instanceof File
|
||||
? (dashboard.faviconFile.toJSON() as any)
|
||||
: (dashboard.faviconFile as any),
|
||||
)
|
||||
: null,
|
||||
});
|
||||
@@ -289,7 +294,9 @@ export default class DashboardAPI extends BaseAPI<
|
||||
pageDescription: dashboard.pageDescription || "",
|
||||
logoFile: dashboard.logoFile
|
||||
? JSONFunctions.serialize(
|
||||
dashboard.logoFile.toJSON() as any,
|
||||
dashboard.logoFile instanceof File
|
||||
? (dashboard.logoFile.toJSON() as any)
|
||||
: (dashboard.logoFile as any),
|
||||
)
|
||||
: null,
|
||||
dashboardViewConfig: dashboard.dashboardViewConfig
|
||||
|
||||
452
Telemetry/Docs/opentelemetry-profiles-roadmap.md
Normal file
452
Telemetry/Docs/opentelemetry-profiles-roadmap.md
Normal file
@@ -0,0 +1,452 @@
|
||||
# OpenTelemetry Profiles: Implementation Roadmap for OneUptime
|
||||
|
||||
## Overview
|
||||
|
||||
OpenTelemetry Profiles is the fourth core observability signal (joining traces, metrics, and logs), providing a unified standard for continuous production profiling. As of March 2026, it has reached **Public Alpha** status. This document outlines how OneUptime can add first-class support for ingesting, storing, querying, and visualizing profiling data.
|
||||
|
||||
Reference: https://opentelemetry.io/blog/2026/profiles-alpha/
|
||||
|
||||
---
|
||||
|
||||
## Why Profiles Matter for OneUptime
|
||||
|
||||
- **Complete Observability**: Profiles fill the gap between "what happened" (traces/logs) and "why it was slow" (CPU/memory/allocation hotspots).
|
||||
- **Cross-Signal Correlation**: Profile samples carry `trace_id` and `span_id`, enabling direct linkage from a slow span to the exact flamegraph showing where time was spent.
|
||||
- **Cost Optimization**: Customers can use profiles to identify wasteful code paths and reduce compute costs.
|
||||
- **Competitive Parity**: Major vendors (Datadog, Grafana, Elastic) are actively building OTLP Profiles support.
|
||||
|
||||
---
|
||||
|
||||
## Current Architecture (Context)
|
||||
|
||||
OneUptime already ingests three OTel signals through a consistent pipeline:
|
||||
|
||||
```
|
||||
Client --> gRPC (4317) / HTTP (/otlp/v1/{signal})
|
||||
--> OtelRequestMiddleware (protobuf/JSON decode)
|
||||
--> TelemetryIngest Middleware (auth)
|
||||
--> 202 Accepted (immediate response)
|
||||
--> Bull MQ Queue (async)
|
||||
--> Ingest Service (batch processing)
|
||||
--> ClickHouse (MergeTree tables)
|
||||
```
|
||||
|
||||
Key files to reference:
|
||||
- Ingestion endpoints: `Telemetry/API/OTelIngest.ts`
|
||||
- gRPC server: `Telemetry/GrpcServer.ts`
|
||||
- Proto files: `Telemetry/ProtoFiles/OTel/v1/`
|
||||
- Analytics models: `Common/Models/AnalyticsModels/`
|
||||
- Queue services: `Telemetry/Services/Queue/`
|
||||
|
||||
The Profiles implementation should follow this exact same pattern for consistency.
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Protocol & Ingestion Layer
|
||||
|
||||
**Goal**: Accept OTLP Profiles data over gRPC and HTTP.
|
||||
|
||||
### 1.1 Add Protobuf Definitions
|
||||
|
||||
Add the profiles proto files to `Telemetry/ProtoFiles/OTel/v1/`:
|
||||
|
||||
- `profiles.proto` — Core profiles data model (from `opentelemetry/proto/profiles/v1development/profiles.proto`)
|
||||
- `profiles_service.proto` — ProfilesService with `Export` RPC
|
||||
|
||||
The OTLP Profiles format uses a **deduplicated stack representation** where each unique callstack is stored once, with dictionary tables for common entities (functions, locations, mappings). Key message types:
|
||||
|
||||
```protobuf
|
||||
message ExportProfilesServiceRequest {
|
||||
repeated ResourceProfiles resource_profiles = 1;
|
||||
}
|
||||
|
||||
message ResourceProfiles {
|
||||
Resource resource = 1;
|
||||
repeated ScopeProfiles scope_profiles = 2;
|
||||
string schema_url = 3;
|
||||
}
|
||||
|
||||
message ScopeProfiles {
|
||||
InstrumentationScope scope = 1;
|
||||
repeated ProfileContainer profiles = 2;
|
||||
}
|
||||
|
||||
message ProfileContainer {
|
||||
bytes profile_id = 1;
|
||||
int64 start_time_unix_nano = 2;
|
||||
int64 end_time_unix_nano = 3;
|
||||
Profile profile = 5;
|
||||
// ...attributes, dropped_attributes_count
|
||||
}
|
||||
|
||||
message Profile {
|
||||
// Dictionary tables for deduplication
|
||||
repeated ValueType sample_type = 1;
|
||||
repeated Sample sample = 2;
|
||||
repeated Location location = 4;
|
||||
repeated Function function = 5;
|
||||
repeated Mapping mapping = 3;
|
||||
repeated AttributeUnit attribute_units = 15;
|
||||
repeated Link link_table = 16;
|
||||
repeated string string_table = 6;
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
### 1.2 Register HTTP Endpoint
|
||||
|
||||
In `Telemetry/API/OTelIngest.ts`, add:
|
||||
|
||||
```
|
||||
POST /otlp/v1/profiles
|
||||
```
|
||||
|
||||
Follow the same pattern as traces/metrics/logs:
|
||||
1. Parse protobuf or JSON body via `OtelRequestMiddleware`
|
||||
2. Authenticate via `TelemetryIngest` middleware
|
||||
3. Return 202 immediately
|
||||
4. Queue for async processing
|
||||
|
||||
### 1.3 Register gRPC Service
|
||||
|
||||
In `Telemetry/GrpcServer.ts`, register the `ProfilesService/Export` RPC handler alongside the existing trace/metrics/logs handlers.
|
||||
|
||||
### 1.4 Update OTel Collector Config
|
||||
|
||||
In `OTelCollector/otel-collector-config.template.yaml`, add a `profiles` pipeline:
|
||||
|
||||
```yaml
|
||||
service:
|
||||
pipelines:
|
||||
profiles:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [otlphttp]
|
||||
```
|
||||
|
||||
### Estimated Effort: 1-2 weeks
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Data Model & ClickHouse Storage
|
||||
|
||||
**Goal**: Design an efficient ClickHouse schema for profile data.
|
||||
|
||||
### 2.1 Design the Analytics Model
|
||||
|
||||
Create `Common/Models/AnalyticsModels/Profile.ts` following the pattern of `Span.ts`, `Metric.ts`, `Log.ts`.
|
||||
|
||||
**Proposed ClickHouse Table: `profile`**
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `projectId` | String (ObjectID) | Tenant ID |
|
||||
| `serviceId` | String (ObjectID) | Service reference |
|
||||
| `profileId` | String | Unique profile identifier |
|
||||
| `traceId` | String | Correlation with traces |
|
||||
| `spanId` | String | Correlation with spans |
|
||||
| `startTime` | DateTime64(9) | Profile start timestamp |
|
||||
| `endTime` | DateTime64(9) | Profile end timestamp |
|
||||
| `duration` | UInt64 | Duration in nanoseconds |
|
||||
| `profileType` | String | e.g., `cpu`, `wall`, `alloc_objects`, `alloc_space`, `goroutine` |
|
||||
| `unit` | String | e.g., `nanoseconds`, `bytes`, `count` |
|
||||
| `periodType` | String | Sampling period type |
|
||||
| `period` | Int64 | Sampling period value |
|
||||
| `attributes` | String (JSON) | Profile-level attributes |
|
||||
| `resourceAttributes` | String (JSON) | Resource attributes |
|
||||
|
||||
**Proposed ClickHouse Table: `profile_sample`**
|
||||
|
||||
This is the high-volume table storing individual samples (denormalized for query performance):
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `projectId` | String (ObjectID) | Tenant ID |
|
||||
| `serviceId` | String (ObjectID) | Service reference |
|
||||
| `profileId` | String | FK to profile table |
|
||||
| `traceId` | String | Trace correlation |
|
||||
| `spanId` | String | Span correlation |
|
||||
| `time` | DateTime64(9) | Sample timestamp |
|
||||
| `stacktrace` | Array(String) | Fully-resolved stack frames (function@file:line) |
|
||||
| `stacktraceHash` | String | Hash of stacktrace for grouping |
|
||||
| `value` | Int64 | Sample value (CPU time, bytes, count) |
|
||||
| `profileType` | String | Denormalized for filtering |
|
||||
| `labels` | String (JSON) | Sample-level labels |
|
||||
|
||||
**Table Engine & Indexing:**
|
||||
- Engine: `MergeTree`
|
||||
- Partition by: `toYYYYMMDD(time)`
|
||||
- Primary key: `(projectId, serviceId, time)`
|
||||
- Order by: `(projectId, serviceId, time, profileType, stacktraceHash)`
|
||||
- TTL: `time + INTERVAL dataRetentionInDays DAY`
|
||||
- Skip indexes on `profileType`, `traceId`, `stacktraceHash`
|
||||
|
||||
### 2.2 Storage Considerations
|
||||
|
||||
**Why two tables?**
|
||||
- The `profile` table stores metadata and is low-volume — used for listing/filtering profiles.
|
||||
- The `profile_sample` table stores denormalized samples — high-volume but optimized for flamegraph aggregation queries.
|
||||
- Alternative: A single table with nested arrays for samples. This is more storage-efficient but makes aggregation queries harder. Start with two tables and revisit if needed.
|
||||
|
||||
**Denormalization strategy:**
|
||||
The OTLP Profiles wire format uses dictionary-based deduplication (string tables, function tables, location tables). At ingestion time, we should **resolve all references** and store fully-materialized stack frames. This trades storage space for query simplicity — the same approach used for span attributes today.
|
||||
|
||||
**Expected data volume:**
|
||||
- A typical eBPF profiler generates ~10-100 samples/second per process
|
||||
- Each sample with a 20-frame stack ≈ 1-2 KB denormalized
|
||||
- For 100 services, ~100K-1M samples/minute
|
||||
- ClickHouse compression (LZ4) reduces this significantly, especially with sorted stacktrace hashes
|
||||
|
||||
### 2.3 Create Database Service
|
||||
|
||||
Create `Common/Server/Services/ProfileService.ts` extending `AnalyticsDatabaseService<Profile>`.
|
||||
|
||||
### Estimated Effort: 2-3 weeks
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Ingestion Service
|
||||
|
||||
**Goal**: Process OTLP Profiles payloads and write to ClickHouse.
|
||||
|
||||
### 3.1 Create Ingest Service
|
||||
|
||||
Create `Telemetry/Services/OtelProfilesIngestService.ts` extending `OtelIngestBaseService`:
|
||||
|
||||
```typescript
|
||||
class OtelProfilesIngestService extends OtelIngestBaseService {
|
||||
// Entry point
|
||||
async ingestProfiles(request: ExportProfilesServiceRequest): Promise<void>;
|
||||
|
||||
// Denormalize OTLP profile data:
|
||||
// 1. Resolve string_table references
|
||||
// 2. Resolve function/location/mapping references
|
||||
// 3. Build fully-qualified stack frames per sample
|
||||
// 4. Extract trace_id/span_id for correlation
|
||||
// 5. Buffer and batch-insert into ClickHouse
|
||||
async processProfile(profile: ProfileContainer, resource: Resource): Promise<void>;
|
||||
|
||||
// Flush buffer (batch size: 500 samples)
|
||||
async flushProfilesBuffer(): Promise<void>;
|
||||
}
|
||||
```
|
||||
|
||||
### 3.2 Create Queue Service
|
||||
|
||||
Create `Telemetry/Services/Queue/ProfilesQueueService.ts`:
|
||||
- Add `TelemetryType.Profiles` enum value
|
||||
- Register queue handler in `Telemetry/Jobs/TelemetryIngest/ProcessTelemetry.ts`
|
||||
- Batch size: 500 (start conservative, tune later)
|
||||
|
||||
### 3.3 Key Implementation Details
|
||||
|
||||
**Denormalization logic** (the hardest part of this phase):
|
||||
|
||||
The OTLP Profile message uses dictionary tables for compression. The ingestion service must resolve these:
|
||||
|
||||
```
|
||||
For each sample in profile.sample:
|
||||
For each location_index in sample.location_index:
|
||||
location = profile.location[location_index]
|
||||
For each line in location.line:
|
||||
function = profile.function[line.function_index]
|
||||
function_name = profile.string_table[function.name]
|
||||
file_name = profile.string_table[function.filename]
|
||||
frame = "${function_name}@${file_name}:${line.line}"
|
||||
Build stacktrace array from frames
|
||||
Compute stacktrace_hash = hash(stacktrace)
|
||||
Extract value from sample.value[type_index]
|
||||
Write denormalized row to buffer
|
||||
```
|
||||
|
||||
**pprof interoperability:**
|
||||
Store enough metadata to reconstruct pprof format for export. The OTLP Profiles format supports round-trip conversion to/from pprof with no information loss.
|
||||
|
||||
### Estimated Effort: 2-3 weeks
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Query API
|
||||
|
||||
**Goal**: Expose APIs for querying and aggregating profile data.
|
||||
|
||||
### 4.1 Core Query Endpoints
|
||||
|
||||
Add to the telemetry API router:
|
||||
|
||||
| Endpoint | Purpose |
|
||||
|----------|---------|
|
||||
| `GET /profiles` | List profiles with filters (service, time range, profile type) |
|
||||
| `GET /profiles/:profileId` | Get profile metadata |
|
||||
| `GET /profiles/:profileId/flamegraph` | Aggregated flamegraph data for a single profile |
|
||||
| `GET /profiles/aggregate/flamegraph` | Aggregated flamegraph across multiple profiles (time range) |
|
||||
| `GET /profiles/function-list` | Top functions by self/total time |
|
||||
| `GET /profiles/diff` | Diff flamegraph between two time ranges |
|
||||
|
||||
### 4.2 Flamegraph Aggregation Query
|
||||
|
||||
The core query for flamegraph rendering in ClickHouse:
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
stacktrace,
|
||||
SUM(value) as total_value
|
||||
FROM profile_sample
|
||||
WHERE projectId = {projectId}
|
||||
AND serviceId = {serviceId}
|
||||
AND time BETWEEN {startTime} AND {endTime}
|
||||
AND profileType = {profileType}
|
||||
GROUP BY stacktrace
|
||||
ORDER BY total_value DESC
|
||||
LIMIT 10000
|
||||
```
|
||||
|
||||
The API layer then builds a tree structure from flat stacktraces for the frontend flamegraph component.
|
||||
|
||||
### 4.3 Cross-Signal Correlation Queries
|
||||
|
||||
Leverage `traceId`/`spanId` columns for correlation:
|
||||
|
||||
```sql
|
||||
-- Get profile samples for a specific trace
|
||||
SELECT stacktrace, SUM(value) as total_value
|
||||
FROM profile_sample
|
||||
WHERE projectId = {projectId}
|
||||
AND traceId = {traceId}
|
||||
GROUP BY stacktrace
|
||||
|
||||
-- Get profile samples for a specific span
|
||||
SELECT stacktrace, SUM(value) as total_value
|
||||
FROM profile_sample
|
||||
WHERE projectId = {projectId}
|
||||
AND spanId = {spanId}
|
||||
GROUP BY stacktrace
|
||||
```
|
||||
|
||||
This enables a "View Profile" button on the trace detail page.
|
||||
|
||||
### Estimated Effort: 2 weeks
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Frontend — Profiles UI
|
||||
|
||||
**Goal**: Build the profiles exploration and visualization UI.
|
||||
|
||||
### 5.1 New Pages & Routes
|
||||
|
||||
Add to `App/FeatureSet/Dashboard/src/`:
|
||||
|
||||
- `Pages/Profiles/ProfileList.tsx` — List/search profiles by service, time range, type
|
||||
- `Pages/Profiles/ProfileDetail.tsx` — Single profile detail view
|
||||
- `Routes/ProfilesRoutes.tsx` — Route definitions
|
||||
|
||||
### 5.2 Core Components
|
||||
|
||||
| Component | Purpose |
|
||||
|-----------|---------|
|
||||
| `Components/Profiles/FlameGraph.tsx` | Interactive flamegraph (CPU/memory/alloc). Consider using an existing open-source flamegraph library (e.g., `speedscope` or `d3-flame-graph`) |
|
||||
| `Components/Profiles/FunctionList.tsx` | Table of functions sorted by self/total time with search |
|
||||
| `Components/Profiles/ProfileTypeSelector.tsx` | Dropdown to select profile type (CPU, heap, goroutine, etc.) |
|
||||
| `Components/Profiles/DiffFlameGraph.tsx` | Side-by-side or differential flamegraph comparing two time ranges |
|
||||
| `Components/Profiles/ProfileTimeline.tsx` | Timeline showing profile sample density over time |
|
||||
|
||||
### 5.3 Cross-Signal Integration
|
||||
|
||||
- **Trace Detail Page**: Add a "Profile" tab/button on `TraceExplorer.tsx` that links to the flamegraph filtered by `traceId`.
|
||||
- **Span Detail**: When viewing a span, show an inline flamegraph if profile samples exist for that `spanId`.
|
||||
- **Service Overview**: Add a "Profiles" tab on the service detail page showing aggregated flamegraphs.
|
||||
|
||||
### 5.4 Navigation
|
||||
|
||||
Add "Profiles" to the dashboard sidebar navigation alongside Traces, Metrics, and Logs.
|
||||
|
||||
### Estimated Effort: 3-4 weeks
|
||||
|
||||
---
|
||||
|
||||
## Phase 6: Production Hardening
|
||||
|
||||
**Goal**: Make the implementation production-ready.
|
||||
|
||||
### 6.1 Data Retention & Billing
|
||||
|
||||
- Add `profileRetentionInDays` to service-level settings (alongside existing telemetry retention)
|
||||
- Add billing metering for profile sample ingestion (samples/month)
|
||||
- Apply TTL rules on ClickHouse tables
|
||||
|
||||
### 6.2 Performance Optimization
|
||||
|
||||
- **Materialized Views**: Pre-aggregate top functions per service per hour for fast dashboard loading
|
||||
- **Sampling**: For high-volume services, support server-side downsampling of profile data
|
||||
- **Compression**: Evaluate dictionary encoding for `stacktrace` column (high repetition rate)
|
||||
- **Query Caching**: Cache aggregated flamegraph results for repeated time ranges
|
||||
|
||||
### 6.3 Alerting Integration
|
||||
|
||||
- Allow alerting on profile metrics (e.g., "alert when function X exceeds Y% of CPU")
|
||||
- Surface profile data in incident timelines
|
||||
|
||||
### 6.4 pprof Export
|
||||
|
||||
- Add `GET /profiles/:profileId/pprof` endpoint that converts stored data back to pprof format
|
||||
- Enables users to download and analyze profiles with existing tools (go tool pprof, etc.)
|
||||
|
||||
### Estimated Effort: 2-3 weeks
|
||||
|
||||
---
|
||||
|
||||
## Phase 7: Documentation & Launch
|
||||
|
||||
### 7.1 User-Facing Docs
|
||||
|
||||
Add `App/FeatureSet/Docs/Content/telemetry/profiles.md`:
|
||||
- How to instrument your application for continuous profiling
|
||||
- Configuring the OTel eBPF profiler agent
|
||||
- Configuring async-profiler (Java) with OTLP export
|
||||
- Viewing profiles in OneUptime
|
||||
- Cross-signal correlation (profiles + traces)
|
||||
|
||||
### 7.2 Example Data
|
||||
|
||||
Add `Telemetry/Docs/profileData.example.json` with a sample OTLP Profiles payload.
|
||||
|
||||
### Estimated Effort: 1 week
|
||||
|
||||
---
|
||||
|
||||
## Summary Timeline
|
||||
|
||||
| Phase | Description | Effort | Dependencies |
|
||||
|-------|-------------|--------|--------------|
|
||||
| 1 | Protocol & Ingestion Layer | 1-2 weeks | None |
|
||||
| 2 | Data Model & ClickHouse Storage | 2-3 weeks | Phase 1 |
|
||||
| 3 | Ingestion Service | 2-3 weeks | Phase 1, 2 |
|
||||
| 4 | Query API | 2 weeks | Phase 2, 3 |
|
||||
| 5 | Frontend — Profiles UI | 3-4 weeks | Phase 4 |
|
||||
| 6 | Production Hardening | 2-3 weeks | Phase 5 |
|
||||
| 7 | Documentation & Launch | 1 week | Phase 6 |
|
||||
|
||||
**Total estimated effort: 13-18 weeks** (with parallelization of phases 4 and 5, closer to 11-16 weeks)
|
||||
|
||||
---
|
||||
|
||||
## Key Risks & Mitigations
|
||||
|
||||
| Risk | Impact | Mitigation |
|
||||
|------|--------|------------|
|
||||
| OTLP Profiles is still Alpha — proto schema may change | Breaking changes to ingestion | Pin to specific OTLP proto version (v1.10.0+), add version detection |
|
||||
| High storage volume from continuous profiling | ClickHouse disk/cost growth | Server-side sampling, aggressive TTL defaults (7 days), compression tuning |
|
||||
| Flamegraph rendering performance with large profiles | Slow UI | Limit to top 10K stacktraces, lazy-load deep frames, pre-aggregate |
|
||||
| Denormalization complexity in ingestion | Bugs, data loss | Extensive unit tests with real pprof data, conformance checker validation |
|
||||
| Limited client-side instrumentation maturity | Low adoption | Start with eBPF profiler (no code changes needed), expand as ecosystem matures |
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [OTel Profiles Alpha Blog Post](https://opentelemetry.io/blog/2026/profiles-alpha/)
|
||||
- [OTLP Profiles Proto](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/profiles/v1development/profiles.proto)
|
||||
- [OTel eBPF Profiling Agent](https://github.com/open-telemetry/opentelemetry-ebpf-profiler)
|
||||
- [pprof Format](https://github.com/google/pprof)
|
||||
- [OTel Semantic Conventions for Profiles](https://opentelemetry.io/docs/specs/semconv/general/profiles/)
|
||||
Reference in New Issue
Block a user