diff --git a/App/FeatureSet/Dashboard/src/Components/Dashboard/Components/DashboardBaseComponent.tsx b/App/FeatureSet/Dashboard/src/Components/Dashboard/Components/DashboardBaseComponent.tsx index fef88091d0..afad6ab787 100644 --- a/App/FeatureSet/Dashboard/src/Components/Dashboard/Components/DashboardBaseComponent.tsx +++ b/App/FeatureSet/Dashboard/src/Components/Dashboard/Components/DashboardBaseComponent.tsx @@ -59,9 +59,15 @@ export interface ComponentProps extends DashboardBaseComponentProps { onClick: () => void; } -type InteractionMode = "idle" | "moving" | "resizing-width" | "resizing-height" | "resizing-corner"; +type InteractionMode = + | "idle" + | "moving" + | "resizing-width" + | "resizing-height" + | "resizing-corner"; interface DragState { + mode: InteractionMode; startMouseX: number; startMouseY: number; startComponentTop: number; @@ -75,205 +81,199 @@ const DashboardBaseComponentElement: FunctionComponent = ( ): ReactElement => { const component: DashboardBaseComponent = props.dashboardViewConfig.components.find( - (component: DashboardBaseComponent) => { - return ( - component.componentId.toString() === props.componentId.toString() - ); + (c: DashboardBaseComponent) => { + return c.componentId.toString() === props.componentId.toString(); }, ) as DashboardBaseComponent; const widthOfComponent: number = component.widthInDashboardUnits; const heightOfComponent: number = component.heightInDashboardUnits; - const [interactionMode, setInteractionMode] = useState("idle"); + const [interactionMode, setInteractionMode] = + useState("idle"); const [isHovered, setIsHovered] = useState(false); - const dragStateRef: React.MutableRefObject = useRef(null); + + // Refs to hold mutable values the mouse handler reads. + // This avoids recreating the handler (and removing/re-adding listeners) on + // every render, which was causing the flicker. 
+ const dragStateRef: React.MutableRefObject = + useRef(null); + const componentRef: React.MutableRefObject = + useRef(component); + const propsRef: React.MutableRefObject = + useRef(props); const dashboardComponentRef: React.RefObject = useRef(null); + // Keep refs in sync with latest values on every render. + componentRef.current = component; + propsRef.current = props; + const isDraggingOrResizing: boolean = interactionMode !== "idle"; - const eachDashboardUnitInPx: number = GetDashboardUnitWidthInPx( - props.totalCurrentDashboardWidthInPx, - ); - - const clampPosition: (data: { - top: number; - left: number; - width: number; - height: number; - }) => { top: number; left: number } = useCallback((data: { - top: number; - left: number; - width: number; - height: number; - }): { top: number; left: number } => { - let newTop: number = data.top; - let newLeft: number = data.left; - - const maxLeft: number = DefaultDashboardSize.widthInDashboardUnits - data.width; - const maxTop: number = props.dashboardViewConfig.heightInDashboardUnits - data.height; - - if (newTop > maxTop) { - newTop = maxTop; - } - if (newLeft > maxLeft) { - newLeft = maxLeft; - } - if (newTop < 0) { - newTop = 0; - } - if (newLeft < 0) { - newLeft = 0; - } - - return { top: newTop, left: newLeft }; - }, [props.dashboardViewConfig.heightInDashboardUnits]); - - const clampSize: (data: { - width: number; - height: number; - }) => { width: number; height: number } = useCallback((data: { - width: number; - height: number; - }): { width: number; height: number } => { - let newWidth: number = data.width; - let newHeight: number = data.height; - - if (newWidth < component.minWidthInDashboardUnits) { - newWidth = component.minWidthInDashboardUnits; - } - if (newWidth > DefaultDashboardSize.widthInDashboardUnits) { - newWidth = DefaultDashboardSize.widthInDashboardUnits; - } - if (newHeight < component.minHeightInDashboardUnits) { - newHeight = component.minHeightInDashboardUnits; - } - - return { width: 
newWidth, height: newHeight }; - }, [component.minWidthInDashboardUnits, component.minHeightInDashboardUnits]); - + // Stable handler — never recreated. Reads everything from refs. const handleMouseMove: (event: MouseEvent) => void = useCallback( (event: MouseEvent): void => { - if (!dragStateRef.current) { + const state: DragState | null = dragStateRef.current; + + if (!state) { return; } - const state: DragState = dragStateRef.current; + const currentComponent: DashboardBaseComponent = componentRef.current; + const currentProps: ComponentProps = propsRef.current; + const unitPx: number = GetDashboardUnitWidthInPx( + currentProps.totalCurrentDashboardWidthInPx, + ); const deltaXInPx: number = event.clientX - state.startMouseX; const deltaYInPx: number = event.clientY - state.startMouseY; - if (interactionMode === "moving") { - const deltaXUnits: number = Math.round(deltaXInPx / eachDashboardUnitInPx); - const deltaYUnits: number = Math.round(deltaYInPx / eachDashboardUnitInPx); + if (state.mode === "moving") { + const deltaXUnits: number = Math.round(deltaXInPx / unitPx); + const deltaYUnits: number = Math.round(deltaYInPx / unitPx); - const clamped: { top: number; left: number } = clampPosition({ - top: state.startComponentTop + deltaYUnits, - left: state.startComponentLeft + deltaXUnits, - width: component.widthInDashboardUnits, - height: component.heightInDashboardUnits, - }); + let newTop: number = state.startComponentTop + deltaYUnits; + let newLeft: number = state.startComponentLeft + deltaXUnits; - props.onComponentUpdate({ - ...component, - topInDashboardUnits: clamped.top, - leftInDashboardUnits: clamped.left, - }); - } else if (interactionMode === "resizing-width") { + // Clamp to bounds + const maxLeft: number = + DefaultDashboardSize.widthInDashboardUnits - + currentComponent.widthInDashboardUnits; + const maxTop: number = + currentProps.dashboardViewConfig.heightInDashboardUnits - + currentComponent.heightInDashboardUnits; + + if (newTop > maxTop) { + 
newTop = maxTop; + } + if (newLeft > maxLeft) { + newLeft = maxLeft; + } + if (newTop < 0) { + newTop = 0; + } + if (newLeft < 0) { + newLeft = 0; + } + + // Only update if position actually changed + if ( + newTop !== currentComponent.topInDashboardUnits || + newLeft !== currentComponent.leftInDashboardUnits + ) { + currentProps.onComponentUpdate({ + ...currentComponent, + topInDashboardUnits: newTop, + leftInDashboardUnits: newLeft, + }); + } + } else if (state.mode === "resizing-width") { if (!dashboardComponentRef.current) { return; } - const newWidthPx: number = - event.pageX - - (window.scrollX + dashboardComponentRef.current.getBoundingClientRect().left); + const rect: DOMRect = + dashboardComponentRef.current.getBoundingClientRect(); + const newWidthPx: number = event.pageX - (window.scrollX + rect.left); let widthUnits: number = GetDashboardComponentWidthInDashboardUnits( - props.totalCurrentDashboardWidthInPx, - newWidthPx, + currentProps.totalCurrentDashboardWidthInPx, + Math.max(newWidthPx, unitPx), ); - const clamped: { width: number; height: number } = clampSize({ - width: widthUnits, - height: component.heightInDashboardUnits, - }); - widthUnits = clamped.width; - props.onComponentUpdate({ - ...component, - widthInDashboardUnits: widthUnits, - }); - } else if (interactionMode === "resizing-height") { + if (widthUnits < currentComponent.minWidthInDashboardUnits) { + widthUnits = currentComponent.minWidthInDashboardUnits; + } + if (widthUnits > DefaultDashboardSize.widthInDashboardUnits) { + widthUnits = DefaultDashboardSize.widthInDashboardUnits; + } + + if (widthUnits !== currentComponent.widthInDashboardUnits) { + currentProps.onComponentUpdate({ + ...currentComponent, + widthInDashboardUnits: widthUnits, + }); + } + } else if (state.mode === "resizing-height") { if (!dashboardComponentRef.current) { return; } - const newHeightPx: number = - event.pageY - - (window.scrollY + dashboardComponentRef.current.getBoundingClientRect().top); + const rect: 
DOMRect = + dashboardComponentRef.current.getBoundingClientRect(); + const newHeightPx: number = event.pageY - (window.scrollY + rect.top); let heightUnits: number = GetDashboardComponentHeightInDashboardUnits( - props.totalCurrentDashboardWidthInPx, - newHeightPx, + currentProps.totalCurrentDashboardWidthInPx, + Math.max(newHeightPx, unitPx), ); - const clamped: { width: number; height: number } = clampSize({ - width: component.widthInDashboardUnits, - height: heightUnits, - }); - heightUnits = clamped.height; - props.onComponentUpdate({ - ...component, - heightInDashboardUnits: heightUnits, - }); - } else if (interactionMode === "resizing-corner") { + if (heightUnits < currentComponent.minHeightInDashboardUnits) { + heightUnits = currentComponent.minHeightInDashboardUnits; + } + + if (heightUnits !== currentComponent.heightInDashboardUnits) { + currentProps.onComponentUpdate({ + ...currentComponent, + heightInDashboardUnits: heightUnits, + }); + } + } else if (state.mode === "resizing-corner") { if (!dashboardComponentRef.current) { return; } - const rect: DOMRect = dashboardComponentRef.current.getBoundingClientRect(); + const rect: DOMRect = + dashboardComponentRef.current.getBoundingClientRect(); const newWidthPx: number = event.pageX - (window.scrollX + rect.left); const newHeightPx: number = event.pageY - (window.scrollY + rect.top); let widthUnits: number = GetDashboardComponentWidthInDashboardUnits( - props.totalCurrentDashboardWidthInPx, - newWidthPx, + currentProps.totalCurrentDashboardWidthInPx, + Math.max(newWidthPx, unitPx), ); let heightUnits: number = GetDashboardComponentHeightInDashboardUnits( - props.totalCurrentDashboardWidthInPx, - newHeightPx, + currentProps.totalCurrentDashboardWidthInPx, + Math.max(newHeightPx, unitPx), ); - const clamped: { width: number; height: number } = clampSize({ - width: widthUnits, - height: heightUnits, - }); - widthUnits = clamped.width; - heightUnits = clamped.height; + if (widthUnits < 
currentComponent.minWidthInDashboardUnits) { + widthUnits = currentComponent.minWidthInDashboardUnits; + } + if (widthUnits > DefaultDashboardSize.widthInDashboardUnits) { + widthUnits = DefaultDashboardSize.widthInDashboardUnits; + } + if (heightUnits < currentComponent.minHeightInDashboardUnits) { + heightUnits = currentComponent.minHeightInDashboardUnits; + } - props.onComponentUpdate({ - ...component, - widthInDashboardUnits: widthUnits, - heightInDashboardUnits: heightUnits, - }); + if ( + widthUnits !== currentComponent.widthInDashboardUnits || + heightUnits !== currentComponent.heightInDashboardUnits + ) { + currentProps.onComponentUpdate({ + ...currentComponent, + widthInDashboardUnits: widthUnits, + heightInDashboardUnits: heightUnits, + }); + } } }, - [interactionMode, eachDashboardUnitInPx, component, clampPosition, clampSize, props], + [], // No dependencies — reads from refs ); + // Stable handler — never recreated. const handleMouseUp: () => void = useCallback((): void => { dragStateRef.current = null; setInteractionMode("idle"); document.body.style.cursor = ""; document.body.style.userSelect = ""; - }, []); + window.removeEventListener("mousemove", handleMouseMove); + window.removeEventListener("mouseup", handleMouseUp); + }, [handleMouseMove]); + // Clean up listeners if the component unmounts mid-drag. 
useEffect(() => { - if (interactionMode !== "idle") { - window.addEventListener("mousemove", handleMouseMove); - window.addEventListener("mouseup", handleMouseUp); - document.body.style.userSelect = "none"; - } - return () => { window.removeEventListener("mousemove", handleMouseMove); window.removeEventListener("mouseup", handleMouseUp); }; - }, [interactionMode, handleMouseMove, handleMouseUp]); + }, [handleMouseMove, handleMouseUp]); const startInteraction: ( event: React.MouseEvent, @@ -282,17 +282,26 @@ const DashboardBaseComponentElement: FunctionComponent = ( event.preventDefault(); event.stopPropagation(); + const currentComponent: DashboardBaseComponent = componentRef.current; + dragStateRef.current = { + mode, startMouseX: event.clientX, startMouseY: event.clientY, - startComponentTop: component.topInDashboardUnits, - startComponentLeft: component.leftInDashboardUnits, - startComponentWidth: component.widthInDashboardUnits, - startComponentHeight: component.heightInDashboardUnits, + startComponentTop: currentComponent.topInDashboardUnits, + startComponentLeft: currentComponent.leftInDashboardUnits, + startComponentWidth: currentComponent.widthInDashboardUnits, + startComponentHeight: currentComponent.heightInDashboardUnits, }; setInteractionMode(mode); + // Attach listeners directly — not via useEffect. 
+ window.addEventListener("mousemove", handleMouseMove); + window.addEventListener("mouseup", handleMouseUp); + + document.body.style.userSelect = "none"; + if (mode === "moving") { document.body.style.cursor = "grabbing"; } else if (mode === "resizing-width") { @@ -312,21 +321,20 @@ const DashboardBaseComponentElement: FunctionComponent = ( } else if (props.isSelected && props.isEditMode) { className += " border-blue-400 ring-2 ring-blue-100 shadow-lg z-10"; } else if (props.isEditMode && isHovered) { - className += " border-blue-300 shadow-md z-10 cursor-pointer"; + className += + " border-blue-300 shadow-md z-10 cursor-pointer"; } else if (props.isEditMode) { - className += " border-gray-200 hover:border-blue-300 hover:shadow-md cursor-pointer transition-all duration-200"; + className += + " border-gray-200 hover:border-blue-300 hover:shadow-md cursor-pointer transition-all duration-200"; } else { - className += " border-gray-200 hover:shadow-md transition-shadow duration-200"; + className += + " border-gray-200 hover:shadow-md transition-shadow duration-200"; } - const showHandles: boolean = props.isEditMode && (props.isSelected || isHovered); + const showHandles: boolean = + props.isEditMode && (props.isSelected || isHovered); const getMoveHandle: GetReactElementFunction = (): ReactElement => { - if (!props.isEditMode) { - return <>; - } - - // Full-width top drag bar visible on hover or selection if (!showHandles) { return <>; } @@ -336,10 +344,13 @@ const DashboardBaseComponentElement: FunctionComponent = ( className="absolute top-0 left-0 right-0 z-20 flex items-center justify-center cursor-grab active:cursor-grabbing" style={{ height: "28px", - background: "linear-gradient(180deg, rgba(59,130,246,0.08) 0%, rgba(59,130,246,0.02) 100%)", + background: + "linear-gradient(180deg, rgba(59,130,246,0.08) 0%, rgba(59,130,246,0.02) 100%)", borderBottom: "1px solid rgba(59,130,246,0.12)", }} - onMouseDown={(event: React.MouseEvent) => { + onMouseDown={( + event: 
React.MouseEvent, + ) => { startInteraction(event, "moving"); }} > @@ -373,11 +384,12 @@ const DashboardBaseComponentElement: FunctionComponent = ( width: "8px", cursor: "ew-resize", }} - onMouseDown={(event: React.MouseEvent) => { + onMouseDown={( + event: React.MouseEvent, + ) => { startInteraction(event, "resizing-width"); }} > - {/* Visible handle bar */}
= ( height: "8px", cursor: "ns-resize", }} - onMouseDown={(event: React.MouseEvent) => { + onMouseDown={( + event: React.MouseEvent, + ) => { startInteraction(event, "resizing-height"); }} > - {/* Visible handle bar */}
= ( height: "16px", cursor: "nwse-resize", }} - onMouseDown={(event: React.MouseEvent) => { + onMouseDown={( + event: React.MouseEvent, + ) => { startInteraction(event, "resizing-corner"); }} > - {/* Corner triangle indicator */}
= ( ? "0 4px 12px -2px rgba(59, 130, 246, 0.12), 0 2px 4px -1px rgba(0, 0, 0, 0.04)" : "0 1px 3px 0 rgba(0, 0, 0, 0.04), 0 1px 2px -1px rgba(0, 0, 0, 0.03)", transform: isDraggingOrResizing ? "scale(1.01)" : "scale(1)", - transition: isDraggingOrResizing ? "none" : "box-shadow 0.2s ease, transform 0.15s ease, border-color 0.2s ease", + transition: isDraggingOrResizing + ? "none" + : "box-shadow 0.2s ease, transform 0.15s ease, border-color 0.2s ease", }} key={component.componentId?.toString() || Math.random().toString()} ref={dashboardComponentRef} @@ -530,16 +546,18 @@ const DashboardBaseComponentElement: FunctionComponent = ( setIsHovered(true); }} onMouseLeave={() => { - setIsHovered(false); + if (!isDraggingOrResizing) { + setIsHovered(false); + } }} > {getMoveHandle()} {getSizeTooltip()} - {/* Component type badge - visible on hover or selection in edit mode */} + {/* Component type badge */} {props.isEditMode && (props.isSelected || isHovered) && (
= (
diff --git a/App/FeatureSet/Dashboard/src/Pages/Dashboards/View/AuthenticationSettings.tsx b/App/FeatureSet/Dashboard/src/Pages/Dashboards/View/AuthenticationSettings.tsx index 9efc6c3b17..9bdde5b2de 100644 --- a/App/FeatureSet/Dashboard/src/Pages/Dashboards/View/AuthenticationSettings.tsx +++ b/App/FeatureSet/Dashboard/src/Pages/Dashboards/View/AuthenticationSettings.tsx @@ -163,6 +163,7 @@ const DashboardAuthenticationSettings: FunctionComponent< formType: FormType.Update, modelType: Dashboard, steps: [], + doNotFetchExistingModel: true, }} modelIdToEdit={modelId} /> diff --git a/App/FeatureSet/Dashboard/src/Pages/StatusPages/View/AuthenticationSettings.tsx b/App/FeatureSet/Dashboard/src/Pages/StatusPages/View/AuthenticationSettings.tsx index 8b4b80f415..17fd60406a 100644 --- a/App/FeatureSet/Dashboard/src/Pages/StatusPages/View/AuthenticationSettings.tsx +++ b/App/FeatureSet/Dashboard/src/Pages/StatusPages/View/AuthenticationSettings.tsx @@ -103,12 +103,15 @@ const StatusPageDelete: FunctionComponent< placeholder: "No", }, { - field: { - masterPassword: true, - }, title: "Master Password", - fieldType: FieldType.HiddenText, - placeholder: "Not Set", + fieldType: FieldType.Element, + getElement: (): ReactElement => { + return ( +

+ {isMasterPasswordSet ? "Password is set." : "Not set."} +

+ ); + }, }, ], modelId: modelId, @@ -151,6 +154,7 @@ const StatusPageDelete: FunctionComponent< formType: FormType.Update, modelType: StatusPage, steps: [], + doNotFetchExistingModel: true, }} modelIdToEdit={modelId} /> diff --git a/Telemetry/Docs/opentelemetry-profiles-roadmap.md b/Telemetry/Docs/opentelemetry-profiles-roadmap.md index 59484d0036..e6448670d7 100644 --- a/Telemetry/Docs/opentelemetry-profiles-roadmap.md +++ b/Telemetry/Docs/opentelemetry-profiles-roadmap.md @@ -116,7 +116,11 @@ message Sample { } ``` -### 1.2 Register HTTP Endpoint +### 1.2 Add TelemetryType Enum Value + +In `Common/Types/Telemetry/TelemetryType.ts`, add `Profile = "Profile"` to the existing enum (currently: Metric, Trace, Log, Exception). + +### 1.3 Register HTTP Endpoint In `Telemetry/API/OTelIngest.ts`, add: @@ -124,29 +128,38 @@ In `Telemetry/API/OTelIngest.ts`, add: POST /otlp/v1/profiles ``` -Follow the same pattern as traces/metrics/logs: -1. Parse protobuf or JSON body via `OtelRequestMiddleware` -2. Authenticate via `TelemetryIngest` middleware +Follow the same middleware chain as traces/metrics/logs: +1. `OpenTelemetryRequestMiddleware.getProductType` — Decode protobuf/JSON, set `ProductType.Profiles` +2. `TelemetryIngest.isAuthorizedServiceMiddleware` — Validate `x-oneuptime-token`, extract `projectId` 3. Return 202 immediately 4. Queue for async processing -### 1.3 Register gRPC Service +### 1.4 Register gRPC Service In `Telemetry/GrpcServer.ts`, register the `ProfilesService/Export` RPC handler alongside the existing trace/metrics/logs handlers. 
-### 1.4 Update OTel Collector Config +### 1.5 Update OTel Collector Config -In `OTelCollector/otel-collector-config.template.yaml`, add a `profiles` pipeline: +In `OTelCollector/otel-collector-config.template.yaml`, add a `profiles` pipeline to the existing three pipelines (traces, metrics, logs): ```yaml service: pipelines: profiles: receivers: [otlp] - processors: [batch] + processors: [] exporters: [otlphttp] ``` +**Note:** The OTel Collector in OneUptime is primarily used by the Kubernetes Agent. The main telemetry service handles OTLP ingestion directly. Also note: the OTel Arrow receiver does NOT yet support profiles. + +### 1.6 Helm Chart Updates + +In `HelmChart/Public/oneuptime/templates/telemetry.yaml`: +- No port changes needed (profiles use the same gRPC 4317 and HTTP 3403 ports) +- Add `TELEMETRY_PROFILE_FLUSH_BATCH_SIZE` environment variable +- Update KEDA autoscaling config to account for profiles queue load + ### Estimated Effort: 1-2 weeks --- @@ -175,8 +188,11 @@ Create `Common/Models/AnalyticsModels/Profile.ts` following the pattern of `Span | `unit` | String | e.g., `nanoseconds`, `bytes`, `count` | | `periodType` | String | Sampling period type | | `period` | Int64 | Sampling period value | -| `attributes` | String (JSON) | Profile-level attributes | +| `attributes` | String (JSON) | Profile-level attributes (note: `KeyValueAndUnit`, not `KeyValue` — includes `unit` field) | | `resourceAttributes` | String (JSON) | Resource attributes | +| `originalPayloadFormat` | String | e.g., `pprofext` — for pprof round-tripping | +| `originalPayload` | String (base64) | Raw pprof bytes (optional, for lossless re-export) | +| `retentionDate` | DateTime64 | TTL column for automatic expiry (pattern from existing tables) | **Proposed ClickHouse Table: `profile_sample`** @@ -187,14 +203,17 @@ This is the high-volume table storing individual samples (denormalized for query | `projectId` | String (ObjectID) | Tenant ID | | `serviceId` | String (ObjectID) | 
Service reference | | `profileId` | String | FK to profile table | -| `traceId` | String | Trace correlation | -| `spanId` | String | Span correlation | +| `traceId` | String | Trace correlation (from Link table) | +| `spanId` | String | Span correlation (from Link table) | | `time` | DateTime64(9) | Sample timestamp | | `stacktrace` | Array(String) | Fully-resolved stack frames (function@file:line) | | `stacktraceHash` | String | Hash of stacktrace for grouping | +| `frameTypes` | Array(String) | Per-frame runtime type (`kernel`, `native`, `jvm`, `cpython`, `go`, `v8js`, etc.) | | `value` | Int64 | Sample value (CPU time, bytes, count) | | `profileType` | String | Denormalized for filtering | | `labels` | String (JSON) | Sample-level labels | +| `buildId` | String | Executable build ID (for deferred symbolization) | +| `retentionDate` | DateTime64 | TTL column for automatic expiry | **Table Engine & Indexing:** - Engine: `MergeTree` @@ -209,10 +228,24 @@ This is the high-volume table storing individual samples (denormalized for query **Why two tables?** - The `profile` table stores metadata and is low-volume — used for listing/filtering profiles. - The `profile_sample` table stores denormalized samples — high-volume but optimized for flamegraph aggregation queries. +- This mirrors the existing pattern where `ExceptionInstance` (ClickHouse) is a sub-signal of `Span`, with its own table but linked via `traceId`/`spanId`. - Alternative: A single table with nested arrays for samples. This is more storage-efficient but makes aggregation queries harder. Start with two tables and revisit if needed. **Denormalization strategy:** -The OTLP Profiles wire format uses dictionary-based deduplication (string tables, function tables, location tables). At ingestion time, we should **resolve all references** and store fully-materialized stack frames. This trades storage space for query simplicity — the same approach used for span attributes today. 
+The OTLP Profiles wire format uses dictionary-based deduplication (string tables, function tables, location tables). **Critically, the `ProfilesDictionary` is shared across ALL profiles in a `ProfilesData` batch** — you cannot process individual profiles without the batch-level dictionary context. + +At ingestion time, we should **resolve all dictionary references** and store fully-materialized stack frames. This trades storage space for query simplicity — the same approach used for span attributes today. + +**Inline frame handling:** +`Location.lines` is a repeated field supporting inlined functions — a single location can expand to multiple logical frames. The denormalization logic must expand these into the full stacktrace array. + +**`original_payload` storage decision:** +The `Profile` message includes `original_payload_format` and `original_payload` fields containing the raw pprof bytes. Storing this enables lossless pprof round-trip export but significantly increases storage. Options: +- **Store always**: Full pprof compatibility, ~2-5x storage increase +- **Store on demand**: Only when `original_payload_format` is set (opt-in by producer) +- **Don't store**: Reconstruct pprof from denormalized data (lossy for some edge cases) + +Recommendation: Store on demand (option 2) — only persist when the producer explicitly includes it. **Expected data volume:** - A typical eBPF profiler generates ~10-100 samples/second per process @@ -222,7 +255,17 @@ The OTLP Profiles wire format uses dictionary-based deduplication (string tables ### 2.3 Create Database Service -Create `Common/Server/Services/ProfileService.ts` extending `AnalyticsDatabaseService`. +Create `Common/Server/Services/ProfileService.ts` and `Common/Server/Services/ProfileSampleService.ts`, both extending `AnalyticsDatabaseService` (typed to the `Profile` and `ProfileSample` analytics models respectively). 
+ +Add `TableBillingAccessControl` to both models following the pattern in existing analytics models to enable plan-based billing constraints on profile ingestion/querying. + +### 2.4 Data Migration + +Follow the migration pattern from `Worker/DataMigrations/AddRetentionDateAndSkipIndexesToTelemetryTables.ts`: +- Add `retentionDate` column with TTL expression: `retentionDate DELETE` +- Add skip indexes: `bloom_filter` on `traceId`, `profileId`, `stacktraceHash`; `set` on `profileType` +- Apply `ZSTD(3)` codec on `stacktrace` and `labels` columns (high compression benefit) +- Default retention: 15 days (matching existing telemetry defaults) ### Estimated Effort: 2-3 weeks @@ -265,25 +308,49 @@ Create `Telemetry/Services/Queue/ProfilesQueueService.ts`: **Denormalization logic** (the hardest part of this phase): -The OTLP Profile message uses dictionary tables for compression. The ingestion service must resolve these: +The OTLP Profile message uses dictionary tables for compression. **The dictionary is batch-scoped** — it lives on the `ProfilesData` message, not on individual `Profile` messages. The ingestion service must pass the dictionary when processing each profile. 
``` -For each sample in profile.sample: - For each location_index in sample.location_index: - location = profile.location[location_index] - For each line in location.line: - function = profile.function[line.function_index] - function_name = profile.string_table[function.name] - file_name = profile.string_table[function.filename] - frame = "${function_name}@${file_name}:${line.line}" - Build stacktrace array from frames - Compute stacktrace_hash = hash(stacktrace) - Extract value from sample.value[type_index] - Write denormalized row to buffer +dictionary = profilesData.dictionary // batch-level dictionary + +For each resourceProfiles in profilesData.resource_profiles: + For each scopeProfiles in resourceProfiles.scope_profiles: + For each profile in scopeProfiles.profiles: + For each sample in profile.sample: + stack = dictionary.stack_table[sample.stack_index] + For each location_index in stack.location_indices: + location = dictionary.location_table[location_index] + // Handle INLINE FRAMES: location.lines is repeated + For each line in location.lines: + function = dictionary.function_table[line.function_index] + function_name = dictionary.string_table[function.name_strindex] + system_name = dictionary.string_table[function.system_name_strindex] // mangled name + file_name = dictionary.string_table[function.filename_strindex] + frame_type = attributes[profile.frame.type] // kernel, native, jvm, etc. 
+ frame = "${function_name}@${file_name}:${line.line}" + Build stacktrace array from all frames (including inlined) + Compute stacktrace_hash = SHA256(stacktrace) + + // Resolve trace correlation from Link table + link = dictionary.link_table[sample.link_index] + trace_id = link.trace_id + span_id = link.span_id + + // Note: sample.timestamps_unix_nano is REPEATED (multiple timestamps per sample) + // Use first timestamp as sample time, store all if needed + + Extract value from sample.values[type_index] + Write denormalized row to buffer ``` +**Mixed-runtime stacks:** +The eBPF agent produces stacks that cross kernel/native/managed boundaries (e.g., kernel → libc → JVM → application Java code). Each frame has a `profile.frame.type` attribute. Store this per-frame in the `frameTypes` array column for proper rendering. + +**Unsymbolized frames:** +Not all frames will be symbolized at ingestion time (especially native/kernel frames from eBPF). Store the mapping `build_id` attributes (`process.executable.build_id.gnu`, `.go`, `.htlhash`) so frames can be symbolized later when debug info becomes available. See Phase 6 for symbolization pipeline. + **pprof interoperability:** -Store enough metadata to reconstruct pprof format for export. The OTLP Profiles format supports round-trip conversion to/from pprof with no information loss. +If `original_payload_format` is set (e.g., `pprofext`), store the `original_payload` bytes for lossless re-export. The OTLP Profiles format supports round-trip conversion to/from pprof with no information loss. 
### Estimated Effort: 2-3 weeks @@ -374,16 +441,26 @@ Add to `App/FeatureSet/Dashboard/src/`: | `Components/Profiles/DiffFlameGraph.tsx` | Side-by-side or differential flamegraph comparing two time ranges | | `Components/Profiles/ProfileTimeline.tsx` | Timeline showing profile sample density over time | -### 5.3 Cross-Signal Integration +**Frame type color coding:** +Mixed-runtime stacks from the eBPF agent contain frames from different runtimes (kernel, native, JVM, CPython, Go, V8, etc.). The flamegraph component should color-code frames by their `profile.frame.type` attribute so users can visually distinguish application code from kernel/native/runtime internals. Suggested palette: +- Kernel frames: red/orange +- Native (C/C++/Rust): blue +- JVM/Go/V8/CPython/Ruby: green shades (per runtime) + +### 5.3 Sidebar Navigation + +Create `Pages/Profiles/SideMenu.tsx` following the existing pattern (see `Pages/Traces/SideMenu.tsx`, `Pages/Metrics/SideMenu.tsx`, `Pages/Logs/SideMenu.tsx`): +- Main section: "Profiles" → PageMap.PROFILES +- Documentation section: Link to PROFILES_DOCUMENTATION route + +Add "Profiles" entry to the main dashboard navigation sidebar. + +### 5.4 Cross-Signal Integration - **Trace Detail Page**: Add a "Profile" tab/button on `TraceExplorer.tsx` that links to the flamegraph filtered by `traceId`. - **Span Detail**: When viewing a span, show an inline flamegraph if profile samples exist for that `spanId`. - **Service Overview**: Add a "Profiles" tab on the service detail page showing aggregated flamegraphs. -### 5.4 Navigation - -Add "Profiles" to the dashboard sidebar navigation alongside Traces, Metrics, and Logs. 
- ### Estimated Effort: 3-4 weeks --- @@ -394,28 +471,49 @@ Add "Profiles" to the dashboard sidebar navigation alongside Traces, Metrics, an ### 6.1 Data Retention & Billing -- Add `profileRetentionInDays` to service-level settings (alongside existing telemetry retention) -- Add billing metering for profile sample ingestion (samples/month) -- Apply TTL rules on ClickHouse tables +- Add `profileRetentionInDays` to service-level settings (alongside existing `retainTelemetryDataForDays`) +- Add billing metering for profile sample ingestion (samples/month) via `TableBillingAccessControl` +- Apply TTL rules on ClickHouse tables using `retentionDate DELETE` pattern ### 6.2 Performance Optimization - **Materialized Views**: Pre-aggregate top functions per service per hour for fast dashboard loading - **Sampling**: For high-volume services, support server-side downsampling of profile data -- **Compression**: Evaluate dictionary encoding for `stacktrace` column (high repetition rate) +- **Compression**: Apply `ZSTD(3)` codec on `stacktrace`, `labels`, and `originalPayload` columns - **Query Caching**: Cache aggregated flamegraph results for repeated time ranges -### 6.3 Alerting Integration +### 6.3 Symbolization Pipeline -- Allow alerting on profile metrics (e.g., "alert when function X exceeds Y% of CPU") +**This is a significant piece of work.** Symbolization is NOT yet standardized in the OTel Profiles spec. OneUptime needs its own strategy: + +1. **Store build IDs at ingestion**: Persist `process.executable.build_id.gnu`, `.go`, `.htlhash` attributes from mappings +2. **Accept symbol uploads**: Provide an API endpoint where users can upload debug symbols (DWARF, PDB, source maps) keyed by build ID +3. **Deferred symbolization**: When symbols are uploaded, re-symbolize existing unsymbolized frames in ClickHouse by matching `buildId` + address +4. 
**Symbol storage**: Store uploaded symbols in object storage (S3/MinIO), indexed by build ID hash + +This can be deferred to a later release — the eBPF agent handles on-target symbolization for Go, and many runtimes (JVM, CPython, V8) provide symbol info at collection time. Native/kernel frames are the main gap. + +### 6.4 Alerting & Monitoring Integration + +Following the existing pattern in `Worker/Jobs/TelemetryMonitor/MonitorTelemetryMonitor.ts`: +- Add `MonitorStepProfileMonitor` configuration type +- Add `ProfileMonitorResponse` response type +- Add `MonitorType.Profiles` to the monitor type enum +- Enable alerting on profile metrics (e.g., "alert when function X exceeds Y% of CPU") - Surface profile data in incident timelines -### 6.4 pprof Export +### 6.5 pprof Export - Add `GET /profiles/:profileId/pprof` endpoint that converts stored data back to pprof format +- If `original_payload` was stored, return it directly (lossless) +- Otherwise, reconstruct pprof from denormalized data - Enables users to download and analyze profiles with existing tools (go tool pprof, etc.) -### Estimated Effort: 2-3 weeks +### 6.6 Conformance Validation + +Integrate the OTel `profcheck` conformance checker tool into CI to validate that OneUptime correctly accepts and processes compliant profiles. This catches regressions when upgrading proto definitions. + +### Estimated Effort: 3-4 weeks --- @@ -447,10 +545,12 @@ Add `Telemetry/Docs/profileData.example.json` with a sample OTLP Profiles payloa | 3 | Ingestion Service | 2-3 weeks | Phase 1, 2 | | 4 | Query API | 2 weeks | Phase 2, 3 | | 5 | Frontend — Profiles UI | 3-4 weeks | Phase 4 | -| 6 | Production Hardening | 2-3 weeks | Phase 5 | +| 6 | Production Hardening (incl. 
symbolization, alerting, conformance) | 3-4 weeks | Phase 5 | | 7 | Documentation & Launch | 1 week | Phase 6 | -**Total estimated effort: 13-19 weeks** (with parallelization of phases 4+5, closer to 10-14 weeks) +**Total estimated effort: 14-20 weeks** (with parallelization of phases 4+5, closer to 11-15 weeks) + +**Suggested MVP scope (Phases 1-5):** Ship ingestion + storage + basic flamegraph UI first (~10-15 weeks). Symbolization, alerting integration, and pprof export can follow as iterative improvements. --- @@ -459,10 +559,14 @@ Add `Telemetry/Docs/profileData.example.json` with a sample OTLP Profiles payloa | Risk | Impact | Mitigation | |------|--------|------------| | OTLP Profiles is still Alpha — proto schema may change | Breaking changes to ingestion | Pin to specific OTLP proto version (v1.10.0+), add version detection | -| High storage volume from continuous profiling | ClickHouse disk/cost growth | Server-side sampling, aggressive TTL defaults (7 days), compression tuning | -| Flamegraph rendering performance with large profiles | Slow UI | Limit to top 10K stacktraces, lazy-load deep frames, pre-aggregate | -| Denormalization complexity in ingestion | Bugs, data loss | Extensive unit tests with real pprof data, conformance checker validation | +| `v1development` package path will change to `v1` at GA | Proto import path migration | Abstract proto version behind internal types; plan migration script for when GA lands | +| High storage volume from continuous profiling | ClickHouse disk/cost growth | Server-side sampling, aggressive TTL defaults (15 days), ZSTD(3) compression | +| Flamegraph rendering performance with large profiles | Slow UI | Limit to top 10K stacktraces, lazy-load deep frames, pre-aggregate via materialized views | +| Denormalization complexity (batch-scoped dictionary, inline frames, mixed runtimes) | Bugs, data loss | Extensive unit tests with real pprof data, conformance checker validation, test with eBPF agent output | +| 
Symbolization is not standardized | Unsymbolized frames in flamegraphs | Store build IDs for deferred symbolization; accept eBPF agent's on-target symbolization as baseline | +| Semantic conventions are minimal (only `profile.frame.type`) | Schema may need changes as conventions mature | Keep attribute storage flexible (JSON columns); avoid hardcoding specific attribute names | | Limited client-side instrumentation maturity | Low adoption | Start with eBPF profiler (no code changes needed), expand as ecosystem matures | +| `original_payload` can be large | Storage bloat | Store on-demand only (when producer sets `original_payload_format`), not by default | ---