feat: Implement OpenTelemetry Profiles roadmap documentation

This commit is contained in:
Nawaz Dhandala
2026-03-26 21:22:15 +00:00
parent 4b339f07ec
commit 50717e5167
3 changed files with 594 additions and 174 deletions

View File

@@ -34,12 +34,6 @@ import MoreMenuItem from "Common/UI/Components/MoreMenu/MoreMenuItem";
import IconProp from "Common/Types/Icon/IconProp";
import Button, { ButtonStyleType } from "Common/UI/Components/Button/Button";
import DashboardVariableSelector from "./DashboardVariableSelector";
import NavBar from "Common/UI/Components/Navbar/NavBar";
import NavBarItem from "Common/UI/Components/Navbar/NavBarItem";
import PageMap from "../../Utils/PageMap";
import RouteMap, { RouteUtil } from "../../Utils/RouteMap";
import PublicDashboardUtil from "../../Utils/PublicDashboard";
import Route from "Common/Types/API/Route";
export interface ComponentProps {
dashboardId: ObjectID;
@@ -216,14 +210,6 @@ const DashboardViewPage: FunctionComponent<ComponentProps> = (
return <PageLoader isVisible={true} />;
}
const isPreview: boolean = PublicDashboardUtil.isPreviewPage();
const overviewRoute: Route = RouteUtil.populateRouteParams(
isPreview
? (RouteMap[PageMap.PREVIEW_OVERVIEW] as Route)
: (RouteMap[PageMap.OVERVIEW] as Route),
);
return (
<div
ref={dashboardViewRef}
@@ -233,174 +219,149 @@ const DashboardViewPage: FunctionComponent<ComponentProps> = (
background: "#fafbfc",
}}
>
{/* Header and NavBar */}
<div className="max-w-5xl mx-auto px-3 sm:px-5">
<div className="flex items-center gap-4 mt-5">
{logoUrl && (
<img
src={logoUrl}
alt={dashboardName}
className="h-10 w-auto object-contain"
/>
)}
<div>
<h1 className="text-xl font-semibold text-gray-900 truncate">
{dashboardName}
</h1>
{pageDescription && (
<p className="text-sm text-gray-500 mt-0.5 truncate">
{pageDescription}
</p>
)}
</div>
</div>
<NavBar className="bg-white flex text-center justify-between py-2 mt-5 rounded-lg shadow px-5">
<NavBarItem
id="overview-nav-bar-item"
title="Overview"
icon={IconProp.CheckCircle}
exact={true}
route={overviewRoute}
/>
</NavBar>
</div>
{/* Public Dashboard Toolbar */}
{/* Header */}
<div
className="mx-3 mt-3 mb-2 rounded-lg bg-white border border-gray-200"
className="bg-white border-b border-gray-200"
style={{
boxShadow:
"0 1px 3px 0 rgba(0, 0, 0, 0.05), 0 1px 2px -1px rgba(0, 0, 0, 0.04)",
boxShadow: "0 1px 3px 0 rgba(0, 0, 0, 0.05)",
}}
>
<div
className="h-0.5 rounded-t-lg"
style={{
background: "linear-gradient(90deg, #6366f1 0%, #8b5cf6 100%)",
}}
></div>
<div className="flex items-center justify-between px-5 py-3">
<div className="flex items-center gap-3 min-w-0">
{hasComponents && (
<span className="text-xs text-gray-400 tabular-nums">
{dashboardViewConfig.components.length} widget
{dashboardViewConfig.components.length !== 1 ? "s" : ""}
</span>
)}
{isRefreshing &&
autoRefreshInterval !== AutoRefreshInterval.OFF && (
<span className="inline-flex items-center gap-1.5 text-xs text-blue-600">
<span className="w-1.5 h-1.5 bg-blue-500 rounded-full animate-pulse"></span>
Refreshing
</span>
<div className="max-w-7xl mx-auto px-5 py-4">
<div className="flex items-center justify-between">
{/* Logo + Title + Description */}
<div className="flex items-center gap-4 min-w-0">
{logoUrl && (
<img
src={logoUrl}
alt={dashboardName}
className="h-8 w-auto object-contain flex-shrink-0"
/>
)}
</div>
<div className="flex items-center gap-1.5">
{/* Reset Zoom button */}
{timeRangeStack.length > 0 && (
<Button
icon={IconProp.Refresh}
title="Reset Zoom"
buttonStyle={ButtonStyleType.HOVER_PRIMARY_OUTLINE}
onClick={() => {
const previousRange: RangeStartAndEndDateTime | undefined =
timeRangeStack[0];
if (previousRange) {
setStartAndEndDate(previousRange);
setTimeRangeStack([]);
}
}}
tooltip="Reset to original time range"
/>
)}
{/* Auto-refresh dropdown */}
{hasComponents && (
<MoreMenu
menuIcon={IconProp.Refresh}
text={
autoRefreshInterval !== AutoRefreshInterval.OFF
? getAutoRefreshIntervalLabel(autoRefreshInterval)
: ""
}
>
{Object.values(AutoRefreshInterval).map(
(interval: AutoRefreshInterval) => {
return (
<MoreMenuItem
key={interval}
text={getAutoRefreshIntervalLabel(interval)}
onClick={() => {
setAutoRefreshInterval(interval);
}}
/>
);
},
<div className="min-w-0">
<h1 className="text-lg font-semibold text-gray-900 truncate">
{dashboardName}
</h1>
{pageDescription && (
<p className="text-sm text-gray-500 truncate">
{pageDescription}
</p>
)}
</MoreMenu>
)}
<Button
icon={IconProp.Expand}
buttonStyle={ButtonStyleType.ICON}
onClick={() => {
const canvasElement: HTMLDivElement | null =
dashboardCanvasRef.current;
if (!canvasElement) {
return;
}
if (canvasElement.requestFullscreen) {
canvasElement.requestFullscreen();
}
}}
tooltip="Full Screen"
/>
</div>
</div>
{/* Bottom row: Time range + variables */}
{hasComponents && (
<div className="flex items-center gap-3 px-5 pb-3 pt-0 flex-wrap">
<div>
<RangeStartAndEndDateView
dashboardStartAndEndDate={startAndEndDate}
onChange={(newRange: RangeStartAndEndDateTime) => {
setTimeRangeStack([...timeRangeStack, startAndEndDate]);
setStartAndEndDate(newRange);
}}
/>
</div>
</div>
{dashboardVariables.length > 0 && (
<>
<div className="w-px h-5 bg-gray-200"></div>
<DashboardVariableSelector
variables={dashboardVariables}
onVariableValueChange={(
variableId: string,
value: string,
) => {
setDashboardVariables(
dashboardVariables.map((v: DashboardVariable) => {
if (v.id === variableId) {
return { ...v, currentValue: value };
}
return v;
}),
);
{/* Controls */}
<div className="flex items-center gap-2 flex-shrink-0">
{isRefreshing &&
autoRefreshInterval !== AutoRefreshInterval.OFF && (
<span className="inline-flex items-center gap-1.5 text-xs text-blue-600">
<span className="w-1.5 h-1.5 bg-blue-500 rounded-full animate-pulse"></span>
Refreshing
</span>
)}
{timeRangeStack.length > 0 && (
<Button
icon={IconProp.Refresh}
title="Reset Zoom"
buttonStyle={ButtonStyleType.HOVER_PRIMARY_OUTLINE}
onClick={() => {
const previousRange: RangeStartAndEndDateTime | undefined =
timeRangeStack[0];
if (previousRange) {
setStartAndEndDate(previousRange);
setTimeRangeStack([]);
}
}}
tooltip="Reset to original time range"
/>
)}
{hasComponents && (
<MoreMenu
menuIcon={IconProp.Refresh}
text={
autoRefreshInterval !== AutoRefreshInterval.OFF
? getAutoRefreshIntervalLabel(autoRefreshInterval)
: ""
}
>
{Object.values(AutoRefreshInterval).map(
(interval: AutoRefreshInterval) => {
return (
<MoreMenuItem
key={interval}
text={getAutoRefreshIntervalLabel(interval)}
onClick={() => {
setAutoRefreshInterval(interval);
}}
/>
);
},
)}
</MoreMenu>
)}
<Button
icon={IconProp.Expand}
buttonStyle={ButtonStyleType.ICON}
onClick={() => {
const canvasElement: HTMLDivElement | null =
dashboardCanvasRef.current;
if (!canvasElement) {
return;
}
if (canvasElement.requestFullscreen) {
canvasElement.requestFullscreen();
}
}}
tooltip="Full Screen"
/>
</div>
</div>
{/* Time range + variables row */}
{hasComponents && (
<div className="flex items-center gap-3 mt-3 flex-wrap">
<div>
<RangeStartAndEndDateView
dashboardStartAndEndDate={startAndEndDate}
onChange={(newRange: RangeStartAndEndDateTime) => {
setTimeRangeStack([...timeRangeStack, startAndEndDate]);
setStartAndEndDate(newRange);
}}
/>
</>
)}
</div>
)}
</div>
{dashboardVariables.length > 0 && (
<>
<div className="w-px h-5 bg-gray-200"></div>
<DashboardVariableSelector
variables={dashboardVariables}
onVariableValueChange={(
variableId: string,
value: string,
) => {
setDashboardVariables(
dashboardVariables.map((v: DashboardVariable) => {
if (v.id === variableId) {
return { ...v, currentValue: value };
}
return v;
}),
);
}}
/>
</>
)}
</div>
)}
</div>
</div>
<div ref={dashboardCanvasRef}>
{/* Dashboard Canvas */}
<div ref={dashboardCanvasRef} className="mt-3">
<DashboardCanvas
dashboardViewConfig={dashboardViewConfig}
onDashboardViewConfigChange={(_config: DashboardViewConfig) => {
@@ -423,7 +384,7 @@ const DashboardViewPage: FunctionComponent<ComponentProps> = (
</div>
{/* Footer */}
<div className="max-w-5xl mx-auto px-3 sm:px-5 py-5">
<div className="max-w-7xl mx-auto px-5 py-5">
<div className="flex items-center justify-center text-xs text-gray-400">
<span>Powered by</span>
<a

View File

@@ -18,6 +18,7 @@ import HashedString from "../../Types/HashedString";
import ObjectID from "../../Types/ObjectID";
import Dashboard from "../../Models/DatabaseModels/Dashboard";
import DashboardDomain from "../../Models/DatabaseModels/DashboardDomain";
import File from "../../Models/DatabaseModels/File";
import { EncryptionSecret } from "../EnvironmentConfig";
import { DASHBOARD_MASTER_PASSWORD_INVALID_MESSAGE } from "../../Types/Dashboard/MasterPassword";
import NotAuthenticatedException from "../../Types/Exception/NotAuthenticatedException";
@@ -214,12 +215,16 @@ export default class DashboardAPI extends BaseAPI<
pageDescription: dashboard.pageDescription || "",
logoFile: dashboard.logoFile
? JSONFunctions.serialize(
dashboard.logoFile.toJSON() as any,
dashboard.logoFile instanceof File
? (dashboard.logoFile.toJSON() as any)
: (dashboard.logoFile as any),
)
: null,
faviconFile: dashboard.faviconFile
? JSONFunctions.serialize(
dashboard.faviconFile.toJSON() as any,
dashboard.faviconFile instanceof File
? (dashboard.faviconFile.toJSON() as any)
: (dashboard.faviconFile as any),
)
: null,
});
@@ -289,7 +294,9 @@ export default class DashboardAPI extends BaseAPI<
pageDescription: dashboard.pageDescription || "",
logoFile: dashboard.logoFile
? JSONFunctions.serialize(
dashboard.logoFile.toJSON() as any,
dashboard.logoFile instanceof File
? (dashboard.logoFile.toJSON() as any)
: (dashboard.logoFile as any),
)
: null,
dashboardViewConfig: dashboard.dashboardViewConfig

View File

@@ -0,0 +1,452 @@
# OpenTelemetry Profiles: Implementation Roadmap for OneUptime
## Overview
OpenTelemetry Profiles is the fourth core observability signal (joining traces, metrics, and logs), providing a unified standard for continuous production profiling. As of March 2026, it has reached **Public Alpha** status. This document outlines how OneUptime can add first-class support for ingesting, storing, querying, and visualizing profiling data.
Reference: https://opentelemetry.io/blog/2026/profiles-alpha/
---
## Why Profiles Matter for OneUptime
- **Complete Observability**: Profiles fill the gap between "what happened" (traces/logs) and "why it was slow" (CPU/memory/allocation hotspots).
- **Cross-Signal Correlation**: Profile samples carry `trace_id` and `span_id`, enabling direct linkage from a slow span to the exact flamegraph showing where time was spent.
- **Cost Optimization**: Customers can use profiles to identify wasteful code paths and reduce compute costs.
- **Competitive Parity**: Major vendors (Datadog, Grafana, Elastic) are actively building OTLP Profiles support.
---
## Current Architecture (Context)
OneUptime already ingests three OTel signals through a consistent pipeline:
```
Client --> gRPC (4317) / HTTP (/otlp/v1/{signal})
--> OtelRequestMiddleware (protobuf/JSON decode)
--> TelemetryIngest Middleware (auth)
--> 202 Accepted (immediate response)
--> Bull MQ Queue (async)
--> Ingest Service (batch processing)
--> ClickHouse (MergeTree tables)
```
Key files to reference:
- Ingestion endpoints: `Telemetry/API/OTelIngest.ts`
- gRPC server: `Telemetry/GrpcServer.ts`
- Proto files: `Telemetry/ProtoFiles/OTel/v1/`
- Analytics models: `Common/Models/AnalyticsModels/`
- Queue services: `Telemetry/Services/Queue/`
The Profiles implementation should follow this exact same pattern for consistency.
---
## Phase 1: Protocol & Ingestion Layer
**Goal**: Accept OTLP Profiles data over gRPC and HTTP.
### 1.1 Add Protobuf Definitions
Add the profiles proto files to `Telemetry/ProtoFiles/OTel/v1/`:
- `profiles.proto` — Core profiles data model (from `opentelemetry/proto/profiles/v1development/profiles.proto`)
- `profiles_service.proto` — ProfilesService with `Export` RPC
The OTLP Profiles format uses a **deduplicated stack representation** where each unique callstack is stored once, with dictionary tables for common entities (functions, locations, mappings). Key message types:
```protobuf
message ExportProfilesServiceRequest {
repeated ResourceProfiles resource_profiles = 1;
}
message ResourceProfiles {
Resource resource = 1;
repeated ScopeProfiles scope_profiles = 2;
string schema_url = 3;
}
message ScopeProfiles {
InstrumentationScope scope = 1;
repeated ProfileContainer profiles = 2;
}
message ProfileContainer {
bytes profile_id = 1;
int64 start_time_unix_nano = 2;
int64 end_time_unix_nano = 3;
Profile profile = 5;
// ...attributes, dropped_attributes_count
}
message Profile {
// Dictionary tables for deduplication
repeated ValueType sample_type = 1;
repeated Sample sample = 2;
repeated Location location = 4;
repeated Function function = 5;
repeated Mapping mapping = 3;
repeated AttributeUnit attribute_units = 15;
repeated Link link_table = 16;
repeated string string_table = 6;
// ...
}
```
### 1.2 Register HTTP Endpoint
In `Telemetry/API/OTelIngest.ts`, add:
```
POST /otlp/v1/profiles
```
Follow the same pattern as traces/metrics/logs:
1. Parse protobuf or JSON body via `OtelRequestMiddleware`
2. Authenticate via `TelemetryIngest` middleware
3. Return 202 immediately
4. Queue for async processing
### 1.3 Register gRPC Service
In `Telemetry/GrpcServer.ts`, register the `ProfilesService/Export` RPC handler alongside the existing trace/metrics/logs handlers.
### 1.4 Update OTel Collector Config
In `OTelCollector/otel-collector-config.template.yaml`, add a `profiles` pipeline:
```yaml
service:
pipelines:
profiles:
receivers: [otlp]
processors: [batch]
exporters: [otlphttp]
```
### Estimated Effort: 1-2 weeks
---
## Phase 2: Data Model & ClickHouse Storage
**Goal**: Design an efficient ClickHouse schema for profile data.
### 2.1 Design the Analytics Model
Create `Common/Models/AnalyticsModels/Profile.ts` following the pattern of `Span.ts`, `Metric.ts`, `Log.ts`.
**Proposed ClickHouse Table: `profile`**
| Column | Type | Description |
|--------|------|-------------|
| `projectId` | String (ObjectID) | Tenant ID |
| `serviceId` | String (ObjectID) | Service reference |
| `profileId` | String | Unique profile identifier |
| `traceId` | String | Correlation with traces |
| `spanId` | String | Correlation with spans |
| `startTime` | DateTime64(9) | Profile start timestamp |
| `endTime` | DateTime64(9) | Profile end timestamp |
| `duration` | UInt64 | Duration in nanoseconds |
| `profileType` | String | e.g., `cpu`, `wall`, `alloc_objects`, `alloc_space`, `goroutine` |
| `unit` | String | e.g., `nanoseconds`, `bytes`, `count` |
| `periodType` | String | Sampling period type |
| `period` | Int64 | Sampling period value |
| `attributes` | String (JSON) | Profile-level attributes |
| `resourceAttributes` | String (JSON) | Resource attributes |
**Proposed ClickHouse Table: `profile_sample`**
This is the high-volume table storing individual samples (denormalized for query performance):
| Column | Type | Description |
|--------|------|-------------|
| `projectId` | String (ObjectID) | Tenant ID |
| `serviceId` | String (ObjectID) | Service reference |
| `profileId` | String | FK to profile table |
| `traceId` | String | Trace correlation |
| `spanId` | String | Span correlation |
| `time` | DateTime64(9) | Sample timestamp |
| `stacktrace` | Array(String) | Fully-resolved stack frames (function@file:line) |
| `stacktraceHash` | String | Hash of stacktrace for grouping |
| `value` | Int64 | Sample value (CPU time, bytes, count) |
| `profileType` | String | Denormalized for filtering |
| `labels` | String (JSON) | Sample-level labels |
**Table Engine & Indexing:**
- Engine: `MergeTree`
- Partition by: `toYYYYMMDD(time)`
- Primary key: `(projectId, serviceId, time)`
- Order by: `(projectId, serviceId, time, profileType, stacktraceHash)`
- TTL: `time + INTERVAL dataRetentionInDays DAY`
- Skip indexes on `profileType`, `traceId`, `stacktraceHash`
### 2.2 Storage Considerations
**Why two tables?**
- The `profile` table stores metadata and is low-volume — used for listing/filtering profiles.
- The `profile_sample` table stores denormalized samples — high-volume but optimized for flamegraph aggregation queries.
- Alternative: A single table with nested arrays for samples. This is more storage-efficient but makes aggregation queries harder. Start with two tables and revisit if needed.
**Denormalization strategy:**
The OTLP Profiles wire format uses dictionary-based deduplication (string tables, function tables, location tables). At ingestion time, we should **resolve all references** and store fully-materialized stack frames. This trades storage space for query simplicity — the same approach used for span attributes today.
**Expected data volume:**
- A typical eBPF profiler generates ~10-100 samples/second per process
- Each sample with a 20-frame stack ≈ 1-2 KB denormalized
- For 100 services, ~100K-1M samples/minute
- ClickHouse compression (LZ4) reduces this significantly, especially with sorted stacktrace hashes
### 2.3 Create Database Service
Create `Common/Server/Services/ProfileService.ts` extending `AnalyticsDatabaseService<Profile>`.
### Estimated Effort: 2-3 weeks
---
## Phase 3: Ingestion Service
**Goal**: Process OTLP Profiles payloads and write to ClickHouse.
### 3.1 Create Ingest Service
Create `Telemetry/Services/OtelProfilesIngestService.ts` extending `OtelIngestBaseService`:
```typescript
class OtelProfilesIngestService extends OtelIngestBaseService {
// Entry point
async ingestProfiles(request: ExportProfilesServiceRequest): Promise<void>;
// Denormalize OTLP profile data:
// 1. Resolve string_table references
// 2. Resolve function/location/mapping references
// 3. Build fully-qualified stack frames per sample
// 4. Extract trace_id/span_id for correlation
// 5. Buffer and batch-insert into ClickHouse
async processProfile(profile: ProfileContainer, resource: Resource): Promise<void>;
// Flush buffer (batch size: 500 samples)
async flushProfilesBuffer(): Promise<void>;
}
```
### 3.2 Create Queue Service
Create `Telemetry/Services/Queue/ProfilesQueueService.ts`:
- Add `TelemetryType.Profiles` enum value
- Register queue handler in `Telemetry/Jobs/TelemetryIngest/ProcessTelemetry.ts`
- Batch size: 500 (start conservative, tune later)
### 3.3 Key Implementation Details
**Denormalization logic** (the hardest part of this phase):
The OTLP Profile message uses dictionary tables for compression. The ingestion service must resolve these:
```
For each sample in profile.sample:
For each location_index in sample.location_index:
location = profile.location[location_index]
For each line in location.line:
function = profile.function[line.function_index]
function_name = profile.string_table[function.name]
file_name = profile.string_table[function.filename]
frame = "${function_name}@${file_name}:${line.line}"
Build stacktrace array from frames
Compute stacktrace_hash = hash(stacktrace)
Extract value from sample.value[type_index]
Write denormalized row to buffer
```
**pprof interoperability:**
Store enough metadata to reconstruct pprof format for export. The OTLP Profiles format supports round-trip conversion to/from pprof with no information loss.
### Estimated Effort: 2-3 weeks
---
## Phase 4: Query API
**Goal**: Expose APIs for querying and aggregating profile data.
### 4.1 Core Query Endpoints
Add to the telemetry API router:
| Endpoint | Purpose |
|----------|---------|
| `GET /profiles` | List profiles with filters (service, time range, profile type) |
| `GET /profiles/:profileId` | Get profile metadata |
| `GET /profiles/:profileId/flamegraph` | Aggregated flamegraph data for a single profile |
| `GET /profiles/aggregate/flamegraph` | Aggregated flamegraph across multiple profiles (time range) |
| `GET /profiles/function-list` | Top functions by self/total time |
| `GET /profiles/diff` | Diff flamegraph between two time ranges |
### 4.2 Flamegraph Aggregation Query
The core query for flamegraph rendering in ClickHouse:
```sql
SELECT
stacktrace,
SUM(value) as total_value
FROM profile_sample
WHERE projectId = {projectId}
AND serviceId = {serviceId}
AND time BETWEEN {startTime} AND {endTime}
AND profileType = {profileType}
GROUP BY stacktrace
ORDER BY total_value DESC
LIMIT 10000
```
The API layer then builds a tree structure from flat stacktraces for the frontend flamegraph component.
### 4.3 Cross-Signal Correlation Queries
Leverage `traceId`/`spanId` columns for correlation:
```sql
-- Get profile samples for a specific trace
SELECT stacktrace, SUM(value) as total_value
FROM profile_sample
WHERE projectId = {projectId}
AND traceId = {traceId}
GROUP BY stacktrace
-- Get profile samples for a specific span
SELECT stacktrace, SUM(value) as total_value
FROM profile_sample
WHERE projectId = {projectId}
AND spanId = {spanId}
GROUP BY stacktrace
```
This enables a "View Profile" button on the trace detail page.
### Estimated Effort: 2 weeks
---
## Phase 5: Frontend — Profiles UI
**Goal**: Build the profiles exploration and visualization UI.
### 5.1 New Pages & Routes
Add to `App/FeatureSet/Dashboard/src/`:
- `Pages/Profiles/ProfileList.tsx` — List/search profiles by service, time range, type
- `Pages/Profiles/ProfileDetail.tsx` — Single profile detail view
- `Routes/ProfilesRoutes.tsx` — Route definitions
### 5.2 Core Components
| Component | Purpose |
|-----------|---------|
| `Components/Profiles/FlameGraph.tsx` | Interactive flamegraph (CPU/memory/alloc). Consider using an existing open-source flamegraph library (e.g., `speedscope` or `d3-flame-graph`) |
| `Components/Profiles/FunctionList.tsx` | Table of functions sorted by self/total time with search |
| `Components/Profiles/ProfileTypeSelector.tsx` | Dropdown to select profile type (CPU, heap, goroutine, etc.) |
| `Components/Profiles/DiffFlameGraph.tsx` | Side-by-side or differential flamegraph comparing two time ranges |
| `Components/Profiles/ProfileTimeline.tsx` | Timeline showing profile sample density over time |
### 5.3 Cross-Signal Integration
- **Trace Detail Page**: Add a "Profile" tab/button on `TraceExplorer.tsx` that links to the flamegraph filtered by `traceId`.
- **Span Detail**: When viewing a span, show an inline flamegraph if profile samples exist for that `spanId`.
- **Service Overview**: Add a "Profiles" tab on the service detail page showing aggregated flamegraphs.
### 5.4 Navigation
Add "Profiles" to the dashboard sidebar navigation alongside Traces, Metrics, and Logs.
### Estimated Effort: 3-4 weeks
---
## Phase 6: Production Hardening
**Goal**: Make the implementation production-ready.
### 6.1 Data Retention & Billing
- Add `profileRetentionInDays` to service-level settings (alongside existing telemetry retention)
- Add billing metering for profile sample ingestion (samples/month)
- Apply TTL rules on ClickHouse tables
### 6.2 Performance Optimization
- **Materialized Views**: Pre-aggregate top functions per service per hour for fast dashboard loading
- **Sampling**: For high-volume services, support server-side downsampling of profile data
- **Compression**: Evaluate dictionary encoding for `stacktrace` column (high repetition rate)
- **Query Caching**: Cache aggregated flamegraph results for repeated time ranges
### 6.3 Alerting Integration
- Allow alerting on profile metrics (e.g., "alert when function X exceeds Y% of CPU")
- Surface profile data in incident timelines
### 6.4 pprof Export
- Add `GET /profiles/:profileId/pprof` endpoint that converts stored data back to pprof format
- Enables users to download and analyze profiles with existing tools (go tool pprof, etc.)
### Estimated Effort: 2-3 weeks
---
## Phase 7: Documentation & Launch
### 7.1 User-Facing Docs
Add `App/FeatureSet/Docs/Content/telemetry/profiles.md`:
- How to instrument your application for continuous profiling
- Configuring the OTel eBPF profiler agent
- Configuring async-profiler (Java) with OTLP export
- Viewing profiles in OneUptime
- Cross-signal correlation (profiles + traces)
### 7.2 Example Data
Add `Telemetry/Docs/profileData.example.json` with a sample OTLP Profiles payload.
### Estimated Effort: 1 week
---
## Summary Timeline
| Phase | Description | Effort | Dependencies |
|-------|-------------|--------|--------------|
| 1 | Protocol & Ingestion Layer | 1-2 weeks | None |
| 2 | Data Model & ClickHouse Storage | 2-3 weeks | Phase 1 |
| 3 | Ingestion Service | 2-3 weeks | Phase 1, 2 |
| 4 | Query API | 2 weeks | Phase 2, 3 |
| 5 | Frontend — Profiles UI | 3-4 weeks | Phase 4 |
| 6 | Production Hardening | 2-3 weeks | Phase 5 |
| 7 | Documentation & Launch | 1 week | Phase 6 |
**Total estimated effort: 13-19 weeks** (with parallelization of phases 4+5, closer to 10-14 weeks)
---
## Key Risks & Mitigations
| Risk | Impact | Mitigation |
|------|--------|------------|
| OTLP Profiles is still Alpha — proto schema may change | Breaking changes to ingestion | Pin to specific OTLP proto version (v1.10.0+), add version detection |
| High storage volume from continuous profiling | ClickHouse disk/cost growth | Server-side sampling, aggressive TTL defaults (7 days), compression tuning |
| Flamegraph rendering performance with large profiles | Slow UI | Limit to top 10K stacktraces, lazy-load deep frames, pre-aggregate |
| Denormalization complexity in ingestion | Bugs, data loss | Extensive unit tests with real pprof data, conformance checker validation |
| Limited client-side instrumentation maturity | Low adoption | Start with eBPF profiler (no code changes needed), expand as ecosystem matures |
---
## References
- [OTel Profiles Alpha Blog Post](https://opentelemetry.io/blog/2026/profiles-alpha/)
- [OTLP Profiles Proto](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/profiles/v1development/profiles.proto)
- [OTel eBPF Profiling Agent](https://github.com/open-telemetry/opentelemetry-ebpf-profiler)
- [pprof Format](https://github.com/google/pprof)
- [OTel Semantic Conventions for Profiles](https://opentelemetry.io/docs/specs/semconv/general/profiles/)