feat: Add profiling support across services and implement new metrics

- Integrated profiling initialization in Probe, Telemetry, TestServer, and Worker services.
- Added environment variables for enabling profiling in various services.
- Created Profiling utility to handle CPU profiling and send data to OTLP endpoint.
- Introduced new metric types for exceptions, spans, and dashboards.
- Developed utility classes for handling alert and incident metrics.
- Added new React components for displaying alert and incident metrics in the dashboard.
This commit is contained in:
Nawaz Dhandala
2026-03-31 13:44:59 +01:00
parent fe5329a1aa
commit d7a339b9aa
28 changed files with 1564 additions and 280 deletions

View File

@@ -11,6 +11,7 @@ import { PromiseVoidFunction } from "Common/Types/FunctionTypes";
import logger from "Common/Server/Utils/Logger";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import Express, { ExpressApplication } from "Common/Server/Utils/Express";
import "ejs";
@@ -23,6 +24,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
logger.info("AI Agent Service - Starting...");
// init the app

View File

@@ -0,0 +1,123 @@
import React, {
FunctionComponent,
ReactElement,
useCallback,
useState,
} from "react";
import ObjectID from "Common/Types/ObjectID";
import AlertMetricType from "Common/Types/Alerts/AlertMetricType";
import AlertMetricTypeUtil from "Common/Utils/Alerts/AlertMetricType";
import MetricView from "../Metrics/MetricView";
import ProjectUtil from "Common/UI/Utils/Project";
import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData";
import MetricViewData from "Common/Types/Metrics/MetricViewData";
import InBetween from "Common/Types/BaseDatabase/InBetween";
import RangeStartAndEndDateTime, {
RangeStartAndEndDateTimeUtil,
} from "Common/Types/Time/RangeStartAndEndDateTime";
import TimeRange from "Common/Types/Time/TimeRange";
import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView";
import Card from "Common/UI/Components/Card/Card";
export interface ComponentProps {
  monitorId: ObjectID;
}

/**
 * Card that charts every alert metric type (count, time to acknowledge,
 * time to resolve, duration) for a single monitor, with a time-range
 * picker in the card header.
 */
const MonitorAlertMetrics: FunctionComponent<ComponentProps> = (
  props: ComponentProps,
): ReactElement => {
  // One chart per alert metric type.
  const metricTypes: Array<AlertMetricType> =
    AlertMetricTypeUtil.getAllAlertMetricTypes();

  // Range shown by the header picker; defaults to the past day.
  const [timeRange, setTimeRange] = useState<RangeStartAndEndDateTime>({
    range: TimeRange.PAST_ONE_DAY,
  });

  type BuildQueryConfigsFunction = () => Array<MetricQueryConfigData>;

  // Builds one metric query per alert metric type, filtered to this
  // monitor and the current project.
  const buildQueryConfigs: BuildQueryConfigsFunction =
    (): Array<MetricQueryConfigData> => {
      return metricTypes.map(
        (metricType: AlertMetricType): MetricQueryConfigData => {
          return {
            metricAliasData: {
              metricVariable: metricType,
              title: AlertMetricTypeUtil.getTitleByAlertMetricType(metricType),
              description:
                AlertMetricTypeUtil.getDescriptionByAlertMetricType(metricType),
              legend: AlertMetricTypeUtil.getLegendByAlertMetricType(metricType),
              legendUnit:
                AlertMetricTypeUtil.getLegendUnitByAlertMetricType(metricType),
            },
            metricQueryData: {
              filterData: {
                metricName: metricType,
                attributes: {
                  // NOTE(review): this filters on "monitorId" while the
                  // incident metrics component filters on "monitorIds" —
                  // confirm both match the attribute names the services emit.
                  monitorId: props.monitorId.toString(),
                  projectId: ProjectUtil.getCurrentProjectId()?.toString() || "",
                },
                aggegationType:
                  AlertMetricTypeUtil.getAggregationTypeByAlertMetricType(
                    metricType,
                  ),
              },
              groupBy: undefined,
            },
          };
        },
      );
    };

  const [metricViewData, setMetricViewData] = useState<MetricViewData>({
    startAndEndDate: RangeStartAndEndDateTimeUtil.getStartAndEndDate({
      range: TimeRange.PAST_ONE_DAY,
    }),
    queryConfigs: buildQueryConfigs(),
    formulaConfigs: [],
  });

  // Keeps the header picker and the chart date window in sync.
  const handleTimeRangeChange: (
    newTimeRange: RangeStartAndEndDateTime,
  ) => void = useCallback((newTimeRange: RangeStartAndEndDateTime): void => {
    setTimeRange(newTimeRange);
    const newWindow: InBetween<Date> =
      RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange);
    setMetricViewData((previous: MetricViewData): MetricViewData => {
      return { ...previous, startAndEndDate: newWindow };
    });
  }, []);

  return (
    <Card
      title="Alert Metrics"
      description="Alert metrics for this monitor - count, time to acknowledge, time to resolve, and duration."
      rightElement={
        <RangeStartAndEndDateView
          dashboardStartAndEndDate={timeRange}
          onChange={handleTimeRangeChange}
        />
      }
    >
      <MetricView
        data={metricViewData}
        hideQueryElements={true}
        hideStartAndEndDate={true}
        hideCardInCharts={true}
        onChange={(data: MetricViewData) => {
          // Re-pin queries and formulas so edits inside MetricView cannot
          // drift from the fixed per-metric-type configuration.
          setMetricViewData({
            ...data,
            queryConfigs: buildQueryConfigs(),
            formulaConfigs: [],
          });
        }}
      />
    </Card>
  );
};

export default MonitorAlertMetrics;

View File

@@ -0,0 +1,129 @@
import React, {
FunctionComponent,
ReactElement,
useCallback,
useState,
} from "react";
import ObjectID from "Common/Types/ObjectID";
import IncidentMetricType from "Common/Types/Incident/IncidentMetricType";
import IncidentMetricTypeUtil from "Common/Utils/Incident/IncidentMetricType";
import MetricView from "../Metrics/MetricView";
import ProjectUtil from "Common/UI/Utils/Project";
import MetricQueryConfigData from "Common/Types/Metrics/MetricQueryConfigData";
import MetricViewData from "Common/Types/Metrics/MetricViewData";
import InBetween from "Common/Types/BaseDatabase/InBetween";
import RangeStartAndEndDateTime, {
RangeStartAndEndDateTimeUtil,
} from "Common/Types/Time/RangeStartAndEndDateTime";
import TimeRange from "Common/Types/Time/TimeRange";
import RangeStartAndEndDateView from "Common/UI/Components/Date/RangeStartAndEndDateView";
import Card from "Common/UI/Components/Card/Card";
export interface ComponentProps {
  monitorId: ObjectID;
}

/**
 * Card that charts every incident metric type (count, time to acknowledge,
 * time to resolve, duration) for a single monitor, with a time-range
 * picker in the card header.
 */
const MonitorIncidentMetrics: FunctionComponent<ComponentProps> = (
  props: ComponentProps,
): ReactElement => {
  // One chart per incident metric type.
  const metricTypes: Array<IncidentMetricType> =
    IncidentMetricTypeUtil.getAllIncidentMetricTypes();

  // Range shown by the header picker; defaults to the past day.
  const [timeRange, setTimeRange] = useState<RangeStartAndEndDateTime>({
    range: TimeRange.PAST_ONE_DAY,
  });

  type BuildQueryConfigsFunction = () => Array<MetricQueryConfigData>;

  // Builds one metric query per incident metric type, filtered to this
  // monitor and the current project.
  const buildQueryConfigs: BuildQueryConfigsFunction =
    (): Array<MetricQueryConfigData> => {
      return metricTypes.map(
        (metricType: IncidentMetricType): MetricQueryConfigData => {
          return {
            metricAliasData: {
              metricVariable: metricType,
              title:
                IncidentMetricTypeUtil.getTitleByIncidentMetricType(metricType),
              description:
                IncidentMetricTypeUtil.getDescriptionByIncidentMetricType(
                  metricType,
                ),
              legend:
                IncidentMetricTypeUtil.getLegendByIncidentMetricType(metricType),
              legendUnit:
                IncidentMetricTypeUtil.getLegendUnitByIncidentMetricType(
                  metricType,
                ),
            },
            metricQueryData: {
              filterData: {
                metricName: metricType,
                attributes: {
                  // NOTE(review): this filters on "monitorIds" while the alert
                  // metrics component filters on "monitorId" — confirm both
                  // match the attribute names the services emit.
                  monitorIds: props.monitorId.toString(),
                  projectId: ProjectUtil.getCurrentProjectId()?.toString() || "",
                },
                aggegationType:
                  IncidentMetricTypeUtil.getAggregationTypeByIncidentMetricType(
                    metricType,
                  ),
              },
              groupBy: undefined,
            },
          };
        },
      );
    };

  const [metricViewData, setMetricViewData] = useState<MetricViewData>({
    startAndEndDate: RangeStartAndEndDateTimeUtil.getStartAndEndDate({
      range: TimeRange.PAST_ONE_DAY,
    }),
    queryConfigs: buildQueryConfigs(),
    formulaConfigs: [],
  });

  // Keeps the header picker and the chart date window in sync.
  const handleTimeRangeChange: (
    newTimeRange: RangeStartAndEndDateTime,
  ) => void = useCallback((newTimeRange: RangeStartAndEndDateTime): void => {
    setTimeRange(newTimeRange);
    const newWindow: InBetween<Date> =
      RangeStartAndEndDateTimeUtil.getStartAndEndDate(newTimeRange);
    setMetricViewData((previous: MetricViewData): MetricViewData => {
      return { ...previous, startAndEndDate: newWindow };
    });
  }, []);

  return (
    <Card
      title="Incident Metrics"
      description="Incident metrics for this monitor - count, time to acknowledge, time to resolve, and duration."
      rightElement={
        <RangeStartAndEndDateView
          dashboardStartAndEndDate={timeRange}
          onChange={handleTimeRangeChange}
        />
      }
    >
      <MetricView
        data={metricViewData}
        hideQueryElements={true}
        hideStartAndEndDate={true}
        hideCardInCharts={true}
        onChange={(data: MetricViewData) => {
          // Re-pin queries and formulas so edits inside MetricView cannot
          // drift from the fixed per-metric-type configuration.
          setMetricViewData({
            ...data,
            queryConfigs: buildQueryConfigs(),
            formulaConfigs: [],
          });
        }}
      />
    </Card>
  );
};

export default MonitorIncidentMetrics;

View File

@@ -1,235 +1,79 @@
import DisabledWarning from "../../../Components/Monitor/DisabledWarning";
import IncidentsTable from "../../../Components/Incident/IncidentsTable";
import AlertsTable from "../../../Components/Alert/AlertsTable";
import MonitorMetricsElement from "../../../Components/Monitor/MonitorMetrics";
import MonitorIncidentMetrics from "../../../Components/Monitor/MonitorIncidentMetrics";
import MonitorAlertMetrics from "../../../Components/Monitor/MonitorAlertMetrics";
import PageComponentProps from "../../PageComponentProps";
import ObjectID from "Common/Types/ObjectID";
import Navigation from "Common/UI/Utils/Navigation";
import React, { Fragment, FunctionComponent, ReactElement, useState } from "react";
import React, { Fragment, FunctionComponent, ReactElement, useEffect, useState } from "react";
import Tabs from "Common/UI/Components/Tabs/Tabs";
import { Tab } from "Common/UI/Components/Tabs/Tab";
import Incident from "Common/Models/DatabaseModels/Incident";
import Alert from "Common/Models/DatabaseModels/Alert";
import Query from "Common/Types/BaseDatabase/Query";
import ProjectUtil from "Common/UI/Utils/Project";
import Includes from "Common/Types/BaseDatabase/Includes";
import MonitorStatusTimeline from "Common/Models/DatabaseModels/MonitorStatusTimeline";
import MonitorStatus from "Common/Models/DatabaseModels/MonitorStatus";
import ModelTable from "Common/UI/Components/ModelTable/ModelTable";
import FieldType from "Common/UI/Components/Types/FieldType";
import SortOrder from "Common/Types/BaseDatabase/SortOrder";
import BadDataException from "Common/Types/Exception/BadDataException";
import Statusbubble from "Common/UI/Components/StatusBubble/StatusBubble";
import { Black } from "Common/Types/BrandColors";
import OneUptimeDate from "Common/Types/Date";
import FormFieldSchemaType from "Common/UI/Components/Forms/Types/FormFieldSchemaType";
import MonitorType from "Common/Types/Monitor/MonitorType";
import MonitorMetricTypeUtil from "Common/Utils/Monitor/MonitorMetricType";
import Monitor from "Common/Models/DatabaseModels/Monitor";
import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI";
import API from "Common/UI/Utils/API/API";
import PageLoader from "Common/UI/Components/Loader/PageLoader";
import ErrorMessage from "Common/UI/Components/ErrorMessage/ErrorMessage";
const MonitorMetrics: FunctionComponent<
PageComponentProps
> = (props: PageComponentProps): ReactElement => {
> = (): ReactElement => {
const modelId: ObjectID = Navigation.getLastParamAsObjectID(1);
const [_currentTab, setCurrentTab] = useState<Tab | null>(null);
const [monitorType, setMonitorType] = useState<MonitorType | null>(null);
const [isLoading, setIsLoading] = useState<boolean>(true);
const [error, setError] = useState<string>("");
const incidentQuery: Query<Incident> = {
projectId: ProjectUtil.getCurrentProjectId()!,
monitors: new Includes([modelId]),
};
useEffect(() => {
setIsLoading(true);
ModelAPI.getItem({
modelType: Monitor,
id: modelId,
select: { monitorType: true },
})
.then((item: Monitor | null) => {
setMonitorType(item?.monitorType || null);
setIsLoading(false);
})
.catch((err: Error) => {
setError(API.getFriendlyMessage(err));
setIsLoading(false);
});
}, []);
const alertQuery: Query<Alert> = {
projectId: ProjectUtil.getCurrentProjectId()!,
monitor: modelId,
};
if (isLoading) {
return <PageLoader isVisible={true} />;
}
const tabs: Array<Tab> = [
{
if (error) {
return <ErrorMessage message={error} />;
}
const hasMonitorMetrics: boolean =
monitorType !== null &&
MonitorMetricTypeUtil.getMonitorMetricTypesByMonitorType(monitorType)
.length > 0;
const tabs: Array<Tab> = [];
if (hasMonitorMetrics) {
tabs.push({
name: "Monitor Metrics",
children: <MonitorMetricsElement monitorId={modelId} />,
},
{
name: "Incidents",
children: (
<IncidentsTable
query={incidentQuery}
noItemsMessage="No incidents found for this monitor."
title="Monitor Incidents"
description="Incidents associated with this monitor."
/>
),
},
{
name: "Alerts",
children: (
<AlertsTable
query={alertQuery}
noItemsMessage="No alerts found for this monitor."
title="Monitor Alerts"
description="Alerts associated with this monitor."
createInitialValues={{
monitor: modelId,
}}
/>
),
},
{
name: "Status Timeline",
children: (
<ModelTable<MonitorStatusTimeline>
modelType={MonitorStatusTimeline}
id="table-monitor-status-timeline"
name="Monitor > Status Timeline"
userPreferencesKey="monitor-status-timeline-table"
isDeleteable={true}
showViewIdButton={true}
isCreateable={true}
isViewable={false}
query={{
monitorId: modelId,
projectId: ProjectUtil.getCurrentProjectId()!,
}}
sortBy="startsAt"
sortOrder={SortOrder.Descending}
onBeforeCreate={(
item: MonitorStatusTimeline,
): Promise<MonitorStatusTimeline> => {
if (!props.currentProject || !props.currentProject._id) {
throw new BadDataException("Project ID cannot be null");
}
item.monitorId = modelId;
item.projectId = new ObjectID(props.currentProject._id);
return Promise.resolve(item);
}}
cardProps={{
title: "Status Timeline",
description: "Here is the status timeline for this monitor",
}}
noItemsMessage={
"No status timeline created for this monitor so far."
}
formFields={[
{
field: {
monitorStatus: true,
},
title: "Monitor Status",
fieldType: FormFieldSchemaType.Dropdown,
required: true,
placeholder: "Monitor Status",
dropdownModal: {
type: MonitorStatus,
labelField: "name",
valueField: "_id",
},
},
{
field: {
startsAt: true,
},
title: "Starts At",
fieldType: FormFieldSchemaType.DateTime,
required: true,
placeholder: "Starts At",
getDefaultValue: () => {
return OneUptimeDate.getCurrentDate();
},
},
]}
showRefreshButton={true}
viewPageRoute={Navigation.getCurrentRoute()}
filters={[
{
field: {
monitorStatus: {
name: true,
},
},
title: "Monitor Status",
type: FieldType.Entity,
filterEntityType: MonitorStatus,
filterQuery: {
projectId: ProjectUtil.getCurrentProjectId()!,
},
filterDropdownField: {
label: "name",
value: "_id",
},
},
{
field: {
startsAt: true,
},
title: "Starts At",
type: FieldType.Date,
},
{
field: {
endsAt: true,
},
title: "Ends At",
type: FieldType.Date,
},
]}
columns={[
{
field: {
monitorStatus: {
name: true,
color: true,
},
},
title: "Monitor Status",
type: FieldType.Text,
getElement: (item: MonitorStatusTimeline): ReactElement => {
if (!item["monitorStatus"]) {
throw new BadDataException("Monitor Status not found");
}
});
}
return (
<Statusbubble
color={item.monitorStatus.color || Black}
shouldAnimate={false}
text={item.monitorStatus.name || "Unknown"}
/>
);
},
},
{
field: {
startsAt: true,
},
title: "Starts At",
type: FieldType.DateTime,
},
{
field: {
endsAt: true,
},
title: "Ends At",
type: FieldType.DateTime,
noValueMessage: "Currently Active",
},
{
field: {
endsAt: true,
},
title: "Duration",
type: FieldType.Text,
getElement: (item: MonitorStatusTimeline): ReactElement => {
return (
<p>
{OneUptimeDate.differenceBetweenTwoDatesAsFromattedString(
item["startsAt"] as Date,
(item["endsAt"] as Date) || OneUptimeDate.getCurrentDate(),
)}
</p>
);
},
},
]}
/>
),
},
];
tabs.push({
name: "Incident Metrics",
children: <MonitorIncidentMetrics monitorId={modelId} />,
});
tabs.push({
name: "Alert Metrics",
children: <MonitorAlertMetrics monitorId={modelId} />,
});
return (
<Fragment>

View File

@@ -46,18 +46,16 @@ const DashboardSideMenu: FunctionComponent<ComponentProps> = (
},
];
if (MonitorTypeHelper.doesMonitorTypeHaveGraphs(props.monitorType)) {
overviewItems.push({
link: {
title: "Metrics",
to: RouteUtil.populateRouteParams(
RouteMap[PageMap.MONITOR_VIEW_METRICS] as Route,
{ modelId: props.modelId },
),
},
icon: IconProp.Graph,
});
}
overviewItems.push({
link: {
title: "Metrics",
to: RouteUtil.populateRouteParams(
RouteMap[PageMap.MONITOR_VIEW_METRICS] as Route,
{ modelId: props.modelId },
),
},
icon: IconProp.Graph,
});
overviewItems.push({
link: {

View File

@@ -16,6 +16,15 @@ import ConfirmModal from "Common/UI/Components/Modal/ConfirmModal";
import ModelAPI from "Common/UI/Utils/ModelAPI/ModelAPI";
import API from "Common/UI/Utils/API/API";
import UUID from "Common/Utils/UUID";
import ComponentID from "Common/Types/Workflow/ComponentID";
import { JSONObject } from "Common/Types/JSON";
import {
ComponentType,
NodeDataProp,
NodeType,
} from "Common/Types/Workflow/Component";
import { useAsyncEffect } from "use-async-effect";
import { Node } from "reactflow";
const Settings: FunctionComponent<PageComponentProps> = (): ReactElement => {
const modelId: ObjectID = Navigation.getLastParamAsObjectID(1);
@@ -23,6 +32,44 @@ const Settings: FunctionComponent<PageComponentProps> = (): ReactElement => {
useState<boolean>(false);
const [refresher, setRefresher] = useState<boolean>(false);
const [error, setError] = useState<string>("");
const [isWebhookTrigger, setIsWebhookTrigger] = useState<boolean>(false);
useAsyncEffect(async () => {
try {
const workflow: Workflow | null = await ModelAPI.getItem({
modelType: Workflow,
id: modelId,
select: {
graph: true,
},
requestOptions: {},
});
if (
workflow?.graph &&
(workflow.graph as JSONObject)["nodes"]
) {
const nodes: Array<JSONObject> = (workflow.graph as JSONObject)[
"nodes"
] as Array<JSONObject>;
for (const node of nodes) {
const nodeData: NodeDataProp = node["data"] as any;
if (
nodeData.componentType === ComponentType.Trigger &&
nodeData.nodeType === NodeType.Node &&
nodeData.metadataId === ComponentID.Webhook
) {
setIsWebhookTrigger(true);
break;
}
}
}
} catch (_err) {
// ignore - just don't show the webhook section
}
}, []);
const resetSecretKey: () => void = (): void => {
setShowResetConfirmation(false);
@@ -44,46 +91,48 @@ const Settings: FunctionComponent<PageComponentProps> = (): ReactElement => {
return (
<Fragment>
<CardModelDetail<Workflow>
name="Workflow > Webhook Secret Key"
cardProps={{
title: "Webhook Secret Key",
description:
"This secret key is used to trigger this workflow via webhook. Use this key in the webhook URL instead of the workflow ID for security. You can reset this key if it is compromised.",
buttons: [
{
title: "Reset Secret Key",
buttonStyle: ButtonStyleType.DANGER_OUTLINE,
onClick: () => {
setShowResetConfirmation(true);
{isWebhookTrigger && (
<CardModelDetail<Workflow>
name="Workflow > Webhook Secret Key"
cardProps={{
title: "Webhook Secret Key",
description:
"This secret key is used to trigger this workflow via webhook. Use this key in the webhook URL instead of the workflow ID for security. You can reset this key if it is compromised.",
buttons: [
{
title: "Reset Secret Key",
buttonStyle: ButtonStyleType.DANGER_OUTLINE,
onClick: () => {
setShowResetConfirmation(true);
},
icon: IconProp.Refresh,
},
icon: IconProp.Refresh,
},
],
}}
isEditable={false}
refresher={refresher}
modelDetailProps={{
showDetailsInNumberOfColumns: 1,
modelType: Workflow,
id: "model-detail-workflow-webhook-secret",
fields: [
{
field: {
webhookSecretKey: true,
],
}}
isEditable={false}
refresher={refresher}
modelDetailProps={{
showDetailsInNumberOfColumns: 1,
modelType: Workflow,
id: "model-detail-workflow-webhook-secret",
fields: [
{
field: {
webhookSecretKey: true,
},
fieldType: FieldType.HiddenText,
title: "Webhook Secret Key",
placeholder:
"No secret key generated yet. Save the workflow to generate one.",
opts: {
isCopyable: true,
},
},
fieldType: FieldType.HiddenText,
title: "Webhook Secret Key",
placeholder:
"No secret key generated yet. Save the workflow to generate one.",
opts: {
isCopyable: true,
},
},
],
modelId: modelId,
}}
/>
],
modelId: modelId,
}}
/>
)}
{showResetConfirmation && (
<ConfirmModal

View File

@@ -15,6 +15,7 @@ import logger from "Common/Server/Utils/Logger";
import Realtime from "Common/Server/Utils/Realtime";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import "ejs";
import OpenAPIUtil from "Common/Server/Utils/OpenAPI";
@@ -27,6 +28,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
const statusCheck: PromiseVoidFunction = async (): Promise<void> => {
// Check the status of infrastructure components
return await InfrastructureStatus.checkStatusWithRetry({

View File

@@ -406,6 +406,9 @@ export const IpWhitelist: string = process.env["IP_WHITELIST"] || "";
// When "true", telemetry collection is disabled for this deployment.
export const DisableTelemetry: boolean =
  process.env["DISABLE_TELEMETRY"] === "true";

// Opt-in flag for CPU profiling; Profiling.init() checks this flag and
// does nothing unless ENABLE_PROFILING is exactly "true".
export const EnableProfiling: boolean =
  process.env["ENABLE_PROFILING"] === "true";

// Marks this deployment as the enterprise edition.
export const IsEnterpriseEdition: boolean =
  process.env["IS_ENTERPRISE_EDITION"] === "true";

View File

@@ -31,6 +31,8 @@ import { IsBillingEnabled } from "../EnvironmentConfig";
import logger from "../Utils/Logger";
import TelemetryUtil from "../Utils/Telemetry/Telemetry";
import MetricService from "./MetricService";
import GlobalConfigService from "./GlobalConfigService";
import GlobalConfig from "../../Models/DatabaseModels/GlobalConfig";
import OneUptimeDate from "../../Types/Date";
import Metric, {
MetricPointType,
@@ -1060,6 +1062,41 @@ ${alertSeverity.name}
});
}
// Fallback retention window applied when no valid value is configured.
private static readonly DEFAULT_METRIC_RETENTION_DAYS: number = 180;

/**
 * Returns the retention window (in days) to stamp on alert metrics.
 *
 * Reads `monitorMetricRetentionInDays` from the singleton GlobalConfig
 * row (looked up via the zero ObjectID) and uses it when it is a
 * positive number; otherwise — including when the lookup throws — it
 * falls back to the 180-day default so metric creation never fails on
 * a config problem.
 *
 * NOTE(review): this reuses the *monitor* metric retention setting for
 * alert metrics — confirm a shared retention window is intended.
 */
private async getMetricRetentionDays(): Promise<number> {
  try {
    const globalConfig: GlobalConfig | null =
      await GlobalConfigService.findOneBy({
        query: {
          _id: ObjectID.getZeroObjectID().toString(),
        },
        props: {
          isRoot: true,
        },
        select: {
          monitorMetricRetentionInDays: true,
        },
      });
    if (
      globalConfig &&
      globalConfig.monitorMetricRetentionInDays !== undefined &&
      globalConfig.monitorMetricRetentionInDays !== null &&
      globalConfig.monitorMetricRetentionInDays > 0
    ) {
      return globalConfig.monitorMetricRetentionInDays;
    }
  } catch (error) {
    // Best-effort: log and fall through to the default below.
    logger.error(
      "Error fetching metric retention config, using default:",
    );
    logger.error(error);
  }
  return Service.DEFAULT_METRIC_RETENTION_DAYS;
}
@CaptureSpan()
public async refreshAlertMetrics(data: { alertId: ObjectID }): Promise<void> {
const alert: Model | null = await this.findOneById({
@@ -1130,6 +1167,12 @@ ${alertSeverity.name}
const itemsToSave: Array<Metric> = [];
const metricTypesMap: Dictionary<MetricType> = {};
const metricRetentionDays: number = await this.getMetricRetentionDays();
const alertMetricRetentionDate: Date = OneUptimeDate.addRemoveDays(
OneUptimeDate.getCurrentDate(),
metricRetentionDays,
);
// now we need to create new metrics for this alert - TimeToAcknowledge, TimeToResolve, AlertCount, AlertDuration
const alertStartsAt: Date =
firstAlertStateTimeline?.startsAt ||
@@ -1160,6 +1203,7 @@ ${alertSeverity.name}
alertCountMetric.time,
);
alertCountMetric.metricPointType = MetricPointType.Sum;
alertCountMetric.retentionDate = alertMetricRetentionDate;
itemsToSave.push(alertCountMetric);
@@ -1214,6 +1258,7 @@ ${alertSeverity.name}
timeToAcknowledgeMetric.time,
);
timeToAcknowledgeMetric.metricPointType = MetricPointType.Sum;
timeToAcknowledgeMetric.retentionDate = alertMetricRetentionDate;
itemsToSave.push(timeToAcknowledgeMetric);
@@ -1270,6 +1315,7 @@ ${alertSeverity.name}
timeToResolveMetric.time,
);
timeToResolveMetric.metricPointType = MetricPointType.Sum;
timeToResolveMetric.retentionDate = alertMetricRetentionDate;
itemsToSave.push(timeToResolveMetric);
@@ -1319,6 +1365,7 @@ ${alertSeverity.name}
alertDurationMetric.time,
);
alertDurationMetric.metricPointType = MetricPointType.Sum;
alertDurationMetric.retentionDate = alertMetricRetentionDate;
itemsToSave.push(alertDurationMetric);

View File

@@ -36,6 +36,8 @@ import MonitorStatusTimeline from "../../Models/DatabaseModels/MonitorStatusTime
import User from "../../Models/DatabaseModels/User";
import { IsBillingEnabled } from "../EnvironmentConfig";
import MetricService from "./MetricService";
import GlobalConfigService from "./GlobalConfigService";
import GlobalConfig from "../../Models/DatabaseModels/GlobalConfig";
import IncidentMetricType from "../../Types/Incident/IncidentMetricType";
import Metric, {
MetricPointType,
@@ -1396,6 +1398,12 @@ ${incident.remediationNotes || "No remediation notes provided."}
postmortemMetric.time,
);
postmortemMetric.metricPointType = MetricPointType.Sum;
const postmortemRetentionDays: number =
await this.getMetricRetentionDays();
postmortemMetric.retentionDate = OneUptimeDate.addRemoveDays(
OneUptimeDate.getCurrentDate(),
postmortemRetentionDays,
);
await MetricService.create({
data: postmortemMetric,
@@ -1583,6 +1591,12 @@ ${incidentSeverity.name}
severityChangeMetric.time,
);
severityChangeMetric.metricPointType = MetricPointType.Sum;
const severityRetentionDays: number =
await this.getMetricRetentionDays();
severityChangeMetric.retentionDate = OneUptimeDate.addRemoveDays(
OneUptimeDate.getCurrentDate(),
severityRetentionDays,
);
await MetricService.create({
data: severityChangeMetric,
@@ -2075,6 +2089,41 @@ ${incidentSeverity.name}
});
}
// Fallback retention window applied when no valid value is configured.
private static readonly DEFAULT_METRIC_RETENTION_DAYS: number = 180;

/**
 * Returns the retention window (in days) to stamp on incident metrics.
 *
 * Reads `monitorMetricRetentionInDays` from the singleton GlobalConfig
 * row (looked up via the zero ObjectID) and uses it when it is a
 * positive number; otherwise — including when the lookup throws — it
 * falls back to the 180-day default so metric creation never fails on
 * a config problem.
 *
 * NOTE(review): this reuses the *monitor* metric retention setting for
 * incident metrics — confirm a shared retention window is intended.
 */
private async getMetricRetentionDays(): Promise<number> {
  try {
    const globalConfig: GlobalConfig | null =
      await GlobalConfigService.findOneBy({
        query: {
          _id: ObjectID.getZeroObjectID().toString(),
        },
        props: {
          isRoot: true,
        },
        select: {
          monitorMetricRetentionInDays: true,
        },
      });
    if (
      globalConfig &&
      globalConfig.monitorMetricRetentionInDays !== undefined &&
      globalConfig.monitorMetricRetentionInDays !== null &&
      globalConfig.monitorMetricRetentionInDays > 0
    ) {
      return globalConfig.monitorMetricRetentionInDays;
    }
  } catch (error) {
    // Best-effort: log and fall through to the default below.
    logger.error(
      "Error fetching metric retention config, using default:",
    );
    logger.error(error);
  }
  return Service.DEFAULT_METRIC_RETENTION_DAYS;
}
@CaptureSpan()
public async refreshIncidentMetrics(data: {
incidentId: ObjectID;
@@ -2223,6 +2272,12 @@ ${incidentSeverity.name}
const itemsToSave: Array<Metric> = [];
const metricRetentionDays: number = await this.getMetricRetentionDays();
const incidentMetricRetentionDate: Date = OneUptimeDate.addRemoveDays(
OneUptimeDate.getCurrentDate(),
metricRetentionDays,
);
// now we need to create new metrics for this incident - TimeToAcknowledge, TimeToResolve, IncidentCount, IncidentDuration
const incidentStartsAt: Date =
@@ -2270,6 +2325,7 @@ ${incidentSeverity.name}
incidentCountMetric.time,
);
incidentCountMetric.metricPointType = MetricPointType.Sum;
incidentCountMetric.retentionDate = incidentMetricRetentionDate;
itemsToSave.push(incidentCountMetric);
@@ -2321,6 +2377,7 @@ ${incidentSeverity.name}
timeToAcknowledgeMetric.time,
);
timeToAcknowledgeMetric.metricPointType = MetricPointType.Sum;
timeToAcknowledgeMetric.retentionDate = incidentMetricRetentionDate;
itemsToSave.push(timeToAcknowledgeMetric);
@@ -2374,6 +2431,7 @@ ${incidentSeverity.name}
timeToResolveMetric.time,
);
timeToResolveMetric.metricPointType = MetricPointType.Sum;
timeToResolveMetric.retentionDate = incidentMetricRetentionDate;
itemsToSave.push(timeToResolveMetric);
@@ -2422,6 +2480,7 @@ ${incidentSeverity.name}
incidentDurationMetric.time,
);
incidentDurationMetric.metricPointType = MetricPointType.Sum;
incidentDurationMetric.retentionDate = incidentMetricRetentionDate;
itemsToSave.push(incidentDurationMetric);
@@ -2474,6 +2533,7 @@ ${incidentSeverity.name}
timeInStateMetric.time,
);
timeInStateMetric.metricPointType = MetricPointType.Sum;
timeInStateMetric.retentionDate = incidentMetricRetentionDate;
itemsToSave.push(timeInStateMetric);
}

View File

@@ -676,6 +676,7 @@ export class LogAggregationService {
bodySearchText?: string | undefined;
traceIds?: Array<string> | undefined;
spanIds?: Array<string> | undefined;
attributes?: Record<string, string> | undefined;
}): Promise<Array<JSONObject>> {
const maxLimit: number = Math.min(request.limit || 10000, 10000);

View File

@@ -1,6 +1,5 @@
import { WorkflowHostname } from "../EnvironmentConfig";
import ClusterKeyAuthorization from "../Middleware/ClusterKeyAuthorization";
import CreateBy from "../Types/Database/CreateBy";
import { OnCreate, OnUpdate } from "../Types/Database/Hooks";
import DatabaseService from "./DatabaseService";
import EmptyResponseData from "../../Types/API/EmptyResponse";
@@ -26,18 +25,29 @@ export class Service extends DatabaseService<Model> {
}
@CaptureSpan()
protected override async onBeforeCreate(
createBy: CreateBy<Model>,
): Promise<OnCreate<Model>> {
protected override async onCreateSuccess(
_onCreate: OnCreate<Model>,
createdItem: Model,
): Promise<Model> {
// Auto-generate webhook secret key for new workflows.
if (!createBy.data.webhookSecretKey) {
createBy.data.webhookSecretKey = UUID.generate();
if (!createdItem.webhookSecretKey && createdItem._id) {
const secretKey: string = UUID.generate();
await this.updateOneById({
id: new ObjectID(createdItem._id),
data: {
webhookSecretKey: secretKey,
} as any,
props: {
isRoot: true,
ignoreHooks: true,
},
});
createdItem.webhookSecretKey = secretKey;
}
return {
createBy,
carryForward: null,
};
return createdItem;
}
@CaptureSpan()

View File

@@ -1,4 +1,5 @@
import logger from "../Logger";
import LogAggregationService from "../../Services/LogAggregationService";
import VMUtil from "../VM/VMAPI";
import APIRequestCriteria from "./Criteria/APIRequestCriteria";
import CustomCodeMonitoringCriteria from "./Criteria/CustomCodeMonitorCriteria";
@@ -116,7 +117,7 @@ export default class MonitorCriteriaEvaluator {
`;
const contextBlock: string | null =
MonitorCriteriaEvaluator.buildRootCauseContext({
await MonitorCriteriaEvaluator.buildRootCauseContext({
dataToProcess: input.dataToProcess,
monitorStep: input.monitorStep,
monitor: input.monitor,
@@ -557,14 +558,16 @@ ${contextBlock}
return null;
}
private static buildRootCauseContext(input: {
private static async buildRootCauseContext(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
}): string | null {
}): Promise<string | null> {
// Handle Kubernetes monitors with rich resource context
if (input.monitor.monitorType === MonitorType.Kubernetes) {
return MonitorCriteriaEvaluator.buildKubernetesRootCauseContext(input);
return await MonitorCriteriaEvaluator.buildKubernetesRootCauseContext(
input,
);
}
const requestDetails: Array<string> = [];
@@ -675,11 +678,11 @@ ${contextBlock}
return sections.join("\n");
}
private static buildKubernetesRootCauseContext(input: {
private static async buildKubernetesRootCauseContext(input: {
dataToProcess: DataToProcess;
monitorStep: MonitorStep;
monitor: Monitor;
}): string | null {
}): Promise<string | null> {
const metricResponse: MetricMonitorResponse =
input.dataToProcess as MetricMonitorResponse;
@@ -730,7 +733,7 @@ ${contextBlock}
);
if (sortedResources.length === 0) {
continue;
return sections.join("\n");
}
// Show top 10 affected resources
@@ -833,6 +836,73 @@ ${contextBlock}
if (analysis) {
sections.push(`\n\n**Root Cause Analysis**\n${analysis}`);
}
// Fetch recent container logs for the top affected resource during CrashLoopBackOff
if (
(breakdown.metricName === "k8s.container.restarts" ||
breakdown.metricName.includes("restart")) &&
input.monitor.projectId
) {
const topResource: KubernetesAffectedResource = resourcesToShow[0]!;
try {
const logAttributes: Record<string, string> = {};
if (breakdown.clusterName) {
logAttributes["resource.k8s.cluster.name"] =
breakdown.clusterName;
}
if (topResource.podName) {
logAttributes["resource.k8s.pod.name"] = topResource.podName;
}
if (topResource.containerName) {
logAttributes["resource.k8s.container.name"] =
topResource.containerName;
}
if (topResource.namespace) {
logAttributes["resource.k8s.namespace.name"] =
topResource.namespace;
}
const now: Date = OneUptimeDate.getCurrentDate();
const fifteenMinutesAgo: Date =
OneUptimeDate.addRemoveMinutes(now, -15);
const logs: Array<JSONObject> =
await LogAggregationService.getExportLogs({
projectId: input.monitor.projectId,
startTime: fifteenMinutesAgo,
endTime: now,
limit: 50,
attributes: logAttributes,
});
if (logs.length > 0) {
const logLines: Array<string> = logs.map((log: JSONObject) => {
const timestamp: string = log["time"]
? String(log["time"])
: "";
const severity: string = log["severityText"]
? String(log["severityText"])
: "INFO";
const body: string = log["body"] ? String(log["body"]) : "";
return `\`${timestamp}\` **${severity}** ${body}`;
});
sections.push(
`\n\n**Recent Container Logs** (${topResource.podName || "unknown pod"} / ${topResource.containerName || "unknown container"}, last 15 minutes)\n\n${logLines.join("\n\n")}`,
);
}
} catch (err) {
logger.error(
"Failed to fetch container logs for root cause context",
);
logger.error(err);
}
}
}
return sections.join("\n");

View File

@@ -0,0 +1,581 @@
import crypto from "crypto";
import http from "http";
import https from "https";
import inspector from "inspector";
import { URL as NodeURL } from "url";
import zlib from "zlib";
import Dictionary from "../../Types/Dictionary";
import {
  AppVersion,
  DisableTelemetry,
  EnableProfiling,
  Env,
} from "../EnvironmentConfig";
import logger from "./Logger";
// V8 CPU Profile types from the inspector module
interface V8CallFrame {
functionName: string;
scriptId: string;
url: string;
lineNumber: number;
columnNumber: number;
}
interface V8CpuProfileNode {
id: number;
callFrame: V8CallFrame;
hitCount: number;
children?: Array<number>;
}
interface V8CpuProfile {
nodes: Array<V8CpuProfileNode>;
startTime: number; // microseconds (monotonic clock)
endTime: number; // microseconds (monotonic clock)
samples: Array<number>; // node IDs
timeDeltas: Array<number>; // microseconds between samples
}
/**
 * Periodic V8 CPU profiler that exports profiles to an OTLP endpoint.
 *
 * Opt-in via the ENABLE_PROFILING environment variable. Every
 * PROFILING_INTERVAL_MS it samples the process for PROFILING_DURATION_MS
 * using the Node.js inspector's Profiler domain, converts the resulting V8
 * profile into the OTLP profiles JSON shape, gzips it, and POSTs it to
 * `<OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT>/v1/profiles`.
 *
 * Failures are logged and swallowed: profiling must never crash or block
 * the host service.
 */
export default class Profiling {
  // Active inspector session, or null when profiling is not running.
  private static session: inspector.Session | null = null;

  // Handle for the periodic collection loop, or null when stopped.
  private static intervalId: ReturnType<typeof setInterval> | null = null;

  // Attached to exported profiles as the `service.name` resource attribute.
  private static serviceName: string = "";

  // Guards against overlapping collections if one run outlasts the interval.
  private static isCollecting: boolean = false;

  // Profile every 60 seconds, sample for 10 seconds each time
  private static readonly PROFILING_INTERVAL_MS: number = 60_000;
  private static readonly PROFILING_DURATION_MS: number = 10_000;

  /**
   * Initializes CPU profiling for this process.
   *
   * No-op unless EnableProfiling is set and telemetry is not disabled.
   * Requires both the OTLP endpoint and headers to be configured; otherwise
   * logs a warning and skips. Safe to call more than once — subsequent
   * calls while a session is active are ignored.
   */
  public static init(data: { serviceName: string }): void {
    if (!EnableProfiling) {
      return;
    }

    if (DisableTelemetry) {
      return;
    }

    // Guard against double initialization: a second init would leak the
    // previous inspector session and stack up SIGTERM listeners.
    if (this.session) {
      return;
    }

    const endpoint: string | null = this.getOtlpProfilesEndpoint();
    const headers: Dictionary<string> = this.getHeaders();

    if (!endpoint || Object.keys(headers).length === 0) {
      logger.warn(
        "Profiling enabled but OTLP endpoint or headers not configured. Skipping profiling initialization.",
      );
      return;
    }

    this.serviceName = data.serviceName;

    try {
      this.session = new inspector.Session();
      this.session.connect();

      this.postToSession("Profiler.enable")
        .then(() => {
          logger.info(
            `CPU profiling initialized for service: ${data.serviceName}`,
          );
          this.startProfilingLoop();
        })
        .catch((err: unknown) => {
          logger.error("Failed to enable V8 profiler:");
          logger.error(err);
        });
    } catch (err) {
      logger.error("Failed to initialize profiling session:");
      logger.error(err);
    }

    process.on("SIGTERM", () => {
      this.stop();
    });
  }

  /**
   * Stops the profiling loop and tears down the inspector session.
   * Idempotent; errors during cleanup are ignored.
   */
  public static stop(): void {
    if (this.intervalId) {
      clearInterval(this.intervalId);
      this.intervalId = null;
    }

    if (this.session) {
      try {
        this.session.post("Profiler.disable");
        this.session.disconnect();
      } catch {
        // Ignore errors during cleanup
      }
      this.session = null;
    }
  }

  /**
   * Builds the OTLP profiles URL from OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT,
   * or returns null when no endpoint is configured.
   */
  private static getOtlpProfilesEndpoint(): string | null {
    const base: string | undefined =
      process.env["OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT"];

    if (!base) {
      return null;
    }

    // Strip any trailing slash so a base of "https://host/" does not
    // produce "https://host//v1/profiles".
    return `${base.replace(/\/+$/, "")}/v1/profiles`;
  }

  /**
   * Parses OPENTELEMETRY_EXPORTER_OTLP_HEADERS ("key=value;key=value")
   * into a dictionary. Malformed entries are skipped.
   */
  private static getHeaders(): Dictionary<string> {
    if (!process.env["OPENTELEMETRY_EXPORTER_OTLP_HEADERS"]) {
      return {};
    }

    const headersStrings: Array<string> =
      process.env["OPENTELEMETRY_EXPORTER_OTLP_HEADERS"].split(";");

    const headers: Dictionary<string> = {};

    for (const headerString of headersStrings) {
      const parts: Array<string> = headerString.split("=");
      if (parts.length === 2) {
        headers[parts[0]!.toString()] = parts[1]!.toString();
      }
    }

    return headers;
  }

  /**
   * Starts the periodic collection loop: one collection after a short
   * startup delay, then one every PROFILING_INTERVAL_MS. Both timers are
   * unref()'d so profiling never keeps the event loop (and therefore the
   * process) alive on its own.
   */
  private static startProfilingLoop(): void {
    // Start the first collection after a short delay
    const initialTimer: ReturnType<typeof setTimeout> = setTimeout(() => {
      this.collectAndSendProfile().catch((err: unknown) => {
        logger.error("Error in initial profile collection:");
        logger.error(err);
      });
    }, 5000);

    this.intervalId = setInterval(() => {
      this.collectAndSendProfile().catch((err: unknown) => {
        logger.error("Error in profile collection:");
        logger.error(err);
      });
    }, this.PROFILING_INTERVAL_MS);

    // Guarded call because browser-style typings declare timers as numbers.
    this.unrefTimer(initialTimer);
    this.unrefTimer(this.intervalId);
  }

  // Calls unref() on a Node.js timer handle when available, so the timer
  // does not prevent process exit.
  private static unrefTimer(timer: unknown): void {
    const handle: { unref?: () => void } = timer as { unref?: () => void };
    if (typeof handle.unref === "function") {
      handle.unref();
    }
  }

  /**
   * Runs one sampling window: start the profiler, wait
   * PROFILING_DURATION_MS, stop it, convert the profile to OTLP and send
   * it. Skips entirely if a previous collection is still in flight.
   */
  private static async collectAndSendProfile(): Promise<void> {
    if (!this.session || this.isCollecting) {
      return;
    }

    this.isCollecting = true;
    const wallClockStartMs: number = Date.now();

    try {
      await this.postToSession("Profiler.start");

      await new Promise<void>((resolve: () => void) => {
        return setTimeout(resolve, this.PROFILING_DURATION_MS);
      });

      const wallClockEndMs: number = Date.now();
      const result: unknown = await this.postToSession("Profiler.stop");

      const profile: V8CpuProfile | undefined = (
        result as { profile?: V8CpuProfile }
      )?.profile;

      if (!profile || !profile.samples || profile.samples.length === 0) {
        return;
      }

      const otlpPayload: object = this.convertV8ProfileToOTLP(
        profile,
        wallClockStartMs,
        wallClockEndMs,
      );

      await this.sendProfile(otlpPayload);
    } catch (err) {
      logger.error("Error collecting/sending profile:");
      logger.error(err);
    } finally {
      this.isCollecting = false;
    }
  }

  /**
   * Promise wrapper around the callback-based inspector session.post().
   * Rejects if the session is gone or the protocol call fails.
   */
  private static postToSession(
    method: string,
    params?: object,
  ): Promise<unknown> {
    return new Promise<unknown>(
      (resolve: (value: unknown) => void, reject: (reason: Error) => void) => {
        if (!this.session) {
          reject(new Error("Inspector session not available"));
          return;
        }

        this.session.post(
          method,
          params || {},
          (err: Error | null, result?: object) => {
            if (err) {
              reject(err);
            } else {
              resolve(result);
            }
          },
        );
      },
    );
  }

  /**
   * Converts a raw V8 CPU profile into the OTLP profiles JSON shape.
   *
   * V8 timestamps come from a monotonic clock (microseconds), so sample
   * times are mapped proportionally onto the wall-clock window
   * [wallClockStartMs, wallClockEndMs]. Internal V8 frames — (root),
   * (program), (idle), (garbage collector) — are dropped from both stacks
   * and samples. Returns `{ resourceProfiles: [] }` when nothing
   * meaningful was sampled.
   */
  private static convertV8ProfileToOTLP(
    v8Profile: V8CpuProfile,
    wallClockStartMs: number,
    wallClockEndMs: number,
  ): object {
    // Build node lookup and parent maps
    const nodeMap: Map<number, V8CpuProfileNode> = new Map<
      number,
      V8CpuProfileNode
    >();
    const parentMap: Map<number, number> = new Map<number, number>();

    for (const node of v8Profile.nodes) {
      nodeMap.set(node.id, node);
      if (node.children) {
        for (const childId of node.children) {
          parentMap.set(childId, node.id);
        }
      }
    }

    // String table with deduplication. Index 0 is reserved for the empty
    // string, as required by the profile format.
    const stringTable: Array<string> = [""];
    const stringIndexMap: Map<string, number> = new Map<string, number>();
    stringIndexMap.set("", 0);

    const getStringIndex: (s: string) => number = (s: string): number => {
      let idx: number | undefined = stringIndexMap.get(s);
      if (idx === undefined) {
        idx = stringTable.length;
        stringTable.push(s);
        stringIndexMap.set(s, idx);
      }
      return idx;
    };

    // Predefined string indices for sample types
    const cpuTypeIdx: number = getStringIndex("cpu");
    const nanosecondsIdx: number = getStringIndex("nanoseconds");
    const samplesTypeIdx: number = getStringIndex("samples");
    const countIdx: number = getStringIndex("count");

    // Build function and location tables
    const functionTable: Array<{ name: number; filename: number }> = [];
    const locationTable: Array<{
      line: Array<{ functionIndex: number; line: number }>;
    }> = [];
    const funcIndexMap: Map<string, number> = new Map<string, number>();
    const locationIndexMap: Map<string, number> = new Map<string, number>();

    const getLocationIndex: (node: V8CpuProfileNode) => number = (
      node: V8CpuProfileNode,
    ): number => {
      const locKey: string = `${node.callFrame.functionName}|${node.callFrame.url}|${node.callFrame.lineNumber}`;
      let locIdx: number | undefined = locationIndexMap.get(locKey);
      if (locIdx !== undefined) {
        return locIdx;
      }

      // Ensure function entry exists
      const fKey: string = `${node.callFrame.functionName}|${node.callFrame.url}`;
      let fIdx: number | undefined = funcIndexMap.get(fKey);
      if (fIdx === undefined) {
        fIdx = functionTable.length;
        functionTable.push({
          name: getStringIndex(node.callFrame.functionName || "(anonymous)"),
          filename: getStringIndex(node.callFrame.url || ""),
        });
        funcIndexMap.set(fKey, fIdx);
      }

      locIdx = locationTable.length;
      locationTable.push({
        line: [
          {
            functionIndex: fIdx,
            line: Math.max(0, node.callFrame.lineNumber + 1), // V8 uses 0-based line numbers
          },
        ],
      });
      locationIndexMap.set(locKey, locIdx);
      return locIdx;
    };

    // Build stack table from samples. Each stack is the leaf-to-root chain
    // of locations for one sampled node, deduplicated by key.
    const stackTable: Array<{ locationIndices: Array<number> }> = [];
    const stackKeyMap: Map<string, number> = new Map<string, number>();

    const getStackIndex: (leafNodeId: number) => number = (
      leafNodeId: number,
    ): number => {
      const locationIndices: Array<number> = [];
      let currentId: number | undefined = leafNodeId;

      while (currentId !== undefined) {
        const node: V8CpuProfileNode | undefined = nodeMap.get(currentId);
        if (!node) {
          break;
        }

        // Skip V8 internal nodes
        const fnName: string = node.callFrame.functionName;
        if (
          fnName !== "(root)" &&
          fnName !== "(program)" &&
          fnName !== "(idle)" &&
          fnName !== "(garbage collector)"
        ) {
          locationIndices.push(getLocationIndex(node));
        }

        currentId = parentMap.get(currentId);
      }

      const key: string = locationIndices.join(",");
      let stackIdx: number | undefined = stackKeyMap.get(key);
      if (stackIdx === undefined) {
        stackIdx = stackTable.length;
        stackTable.push({ locationIndices });
        stackKeyMap.set(key, stackIdx);
      }
      return stackIdx;
    };

    // Use wall clock for absolute timestamps (V8 uses monotonic clock)
    const NANOS_PER_MS: bigint = BigInt(1000000);
    const NANOS_PER_US: bigint = BigInt(1000);
    const ZERO: bigint = BigInt(0);

    const startTimeNano: bigint = BigInt(wallClockStartMs) * NANOS_PER_MS;
    const endTimeNano: bigint = BigInt(wallClockEndMs) * NANOS_PER_MS;

    // Build sample entries
    const samples: Array<{
      stackIndex: number;
      value: Array<string>;
      timestampsUnixNano: Array<string>;
    }> = [];

    let cumulativeDeltaNano: bigint = ZERO;
    const totalV8DurationUs: bigint = BigInt(
      v8Profile.endTime - v8Profile.startTime,
    );
    const totalWallDurationNano: bigint = endTimeNano - startTimeNano;

    for (let i: number = 0; i < v8Profile.samples.length; i++) {
      const nodeId: number = v8Profile.samples[i]!;
      const node: V8CpuProfileNode | undefined = nodeMap.get(nodeId);

      // Accumulate time delta (even for skipped samples, so that later
      // samples keep the correct offset).
      const deltaUs: bigint = BigInt(v8Profile.timeDeltas[i] || 0);
      cumulativeDeltaNano = cumulativeDeltaNano + deltaUs * NANOS_PER_US;

      if (!node) {
        continue;
      }

      // Skip idle/root/program/gc samples
      const fnName: string = node.callFrame.functionName;
      if (
        fnName === "(idle)" ||
        fnName === "(root)" ||
        fnName === "(program)" ||
        fnName === "(garbage collector)"
      ) {
        continue;
      }

      // Map V8 monotonic time to wall clock time proportionally
      const sampleTimeNano: bigint =
        totalV8DurationUs > ZERO
          ? startTimeNano +
            (cumulativeDeltaNano * totalWallDurationNano) /
              (totalV8DurationUs * NANOS_PER_US)
          : startTimeNano + cumulativeDeltaNano;

      const timeDeltaNano: bigint = deltaUs * NANOS_PER_US;
      const stackIndex: number = getStackIndex(nodeId);

      samples.push({
        stackIndex,
        value: [timeDeltaNano.toString(), "1"],
        timestampsUnixNano: [sampleTimeNano.toString()],
      });
    }

    // If no meaningful samples were collected, return an empty payload
    if (samples.length === 0) {
      return { resourceProfiles: [] };
    }

    // Compute average sampling period in nanoseconds
    const avgPeriodNs: number =
      v8Profile.samples.length > 0
        ? Math.trunc(
            ((v8Profile.endTime - v8Profile.startTime) * 1000) /
              v8Profile.samples.length,
          )
        : 1_000_000; // default 1ms

    // Generate a random 16-byte profile ID (base64). Uses the crypto
    // module rather than Math.random so IDs are effectively
    // collision-free across services.
    const profileId: string = crypto.randomBytes(16).toString("base64");

    return {
      resourceProfiles: [
        {
          resource: {
            attributes: [
              {
                key: "service.name",
                value: { stringValue: this.serviceName },
              },
              {
                key: "service.version",
                value: { stringValue: AppVersion },
              },
              {
                key: "deployment.environment",
                value: { stringValue: Env },
              },
            ],
          },
          scopeProfiles: [
            {
              scope: {
                name: "oneuptime-node-profiler",
                version: "1.0.0",
              },
              profiles: [
                {
                  profileId: profileId,
                  startTimeUnixNano: startTimeNano.toString(),
                  endTimeUnixNano: endTimeNano.toString(),
                  attributes: [
                    {
                      key: "profiler.name",
                      value: { stringValue: "v8-cpu-profiler" },
                    },
                    {
                      key: "runtime.name",
                      value: { stringValue: "nodejs" },
                    },
                    {
                      key: "runtime.version",
                      value: { stringValue: process.version },
                    },
                  ],
                  profile: {
                    stringTable,
                    sampleType: [
                      { type: cpuTypeIdx, unit: nanosecondsIdx },
                      { type: samplesTypeIdx, unit: countIdx },
                    ],
                    sample: samples,
                    locationTable,
                    functionTable,
                    stackTable,
                    linkTable: [],
                    attributeTable: [],
                    periodType: { type: cpuTypeIdx, unit: nanosecondsIdx },
                    period: avgPeriodNs.toString(),
                  },
                },
              ],
            },
          ],
        },
      ],
    };
  }

  /**
   * Gzips the OTLP payload and POSTs it to the configured endpoint.
   * Never rejects: transport failures are logged and resolved so profiling
   * cannot crash the host service.
   */
  private static async sendProfile(payload: object): Promise<void> {
    const endpoint: string | null = this.getOtlpProfilesEndpoint();
    if (!endpoint) {
      return;
    }

    const resourceProfiles: Array<unknown> = (
      payload as { resourceProfiles: Array<unknown> }
    ).resourceProfiles;

    if (!resourceProfiles || resourceProfiles.length === 0) {
      return;
    }

    const headers: Dictionary<string> = this.getHeaders();
    const jsonData: string = JSON.stringify(payload);

    const compressed: Buffer = await new Promise<Buffer>(
      (resolve: (value: Buffer) => void, reject: (reason: Error) => void) => {
        zlib.gzip(jsonData, (err: Error | null, result: Buffer) => {
          if (err) {
            reject(err);
          } else {
            resolve(result);
          }
        });
      },
    );

    const url: NodeURL = new NodeURL(endpoint);
    const isHttps: boolean = url.protocol === "https:";
    const httpModule: typeof http | typeof https = isHttps ? https : http;

    return new Promise<void>((resolve: () => void) => {
      const req: http.ClientRequest = httpModule.request(
        {
          hostname: url.hostname,
          port: url.port || (isHttps ? 443 : 80),
          // Include url.search so a query string in the configured
          // endpoint is not silently dropped.
          path: url.pathname + url.search,
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            "Content-Encoding": "gzip",
            ...headers,
          },
        },
        (res: http.IncomingMessage) => {
          let data: string = "";
          res.on("data", (chunk: Buffer) => {
            data += chunk.toString();
          });
          res.on("end", () => {
            if (
              res.statusCode &&
              res.statusCode >= 200 &&
              res.statusCode < 300
            ) {
              logger.debug(
                `Profile sent successfully for service: ${this.serviceName}`,
              );
            } else {
              logger.warn(
                `Profile export failed with status ${res.statusCode}: ${data}`,
              );
            }
            resolve();
          });
        },
      );

      req.on("error", (err: Error) => {
        logger.warn(`Profile export error: ${err.message}`);
        resolve(); // Don't throw - profiling failures should not crash the service
      });

      req.write(compressed);
      req.end();
    });
  }
}

View File

@@ -0,0 +1,15 @@
/**
 * Metric names for exception telemetry, all under the
 * "oneuptime.exception" namespace. The string values are the canonical
 * metric names used when recording and querying these metrics.
 */
enum ExceptionMetricType {
  // Total number of exceptions recorded.
  ExceptionCount = "oneuptime.exception.count",
  // Rate of exceptions (presumably per unit time — confirm against the emitter).
  ExceptionRate = "oneuptime.exception.rate",
  // Exception counts broken down by exception type.
  ExceptionCountByType = "oneuptime.exception.count.by.type",
  // Exception counts broken down by telemetry service.
  ExceptionCountByService = "oneuptime.exception.count.by.service",
  // Count of exceptions currently in the unresolved state.
  UnresolvedExceptionCount = "oneuptime.exception.unresolved.count",
  // Count of exceptions marked resolved.
  ResolvedExceptionCount = "oneuptime.exception.resolved.count",
  // Count of exceptions that have been muted.
  MutedExceptionCount = "oneuptime.exception.muted.count",
  // Timestamp metric for when an exception was first observed.
  ExceptionFirstSeenTime = "oneuptime.exception.first.seen.time",
  // Timestamp metric for when an exception was last observed.
  ExceptionLastSeenTime = "oneuptime.exception.last.seen.time",
  // Number of occurrences of a particular exception.
  ExceptionOccurrenceCount = "oneuptime.exception.occurrence.count",
  // Number of distinct services affected by an exception.
  ExceptionAffectedServiceCount = "oneuptime.exception.affected.service.count",
}

export default ExceptionMetricType;

View File

@@ -0,0 +1,28 @@
/**
 * Well-known metric names surfaced on metric dashboards, grouped by
 * origin (HTTP server, system, runtime, custom). Most values follow
 * OpenTelemetry semantic-convention names.
 */
enum MetricDashboardMetricType {
  // HTTP metrics
  HttpRequestDuration = "http.server.request.duration",
  HttpRequestCount = "http.server.request.count",
  HttpRequestErrorRate = "http.server.request.error.rate",
  HttpResponseSize = "http.server.response.body.size",
  HttpRequestSize = "http.server.request.body.size",
  HttpActiveRequests = "http.server.active_requests",

  // System metrics
  SystemCpuUtilization = "system.cpu.utilization",
  SystemMemoryUsage = "system.memory.usage",
  SystemDiskIo = "system.disk.io",
  SystemNetworkIo = "system.network.io",

  // Runtime metrics
  // NOTE(review): the three members below use JVM-specific semantic
  // convention names under generic member names — confirm these are
  // intended for non-JVM runtimes too.
  ProcessCpuUtilization = "process.cpu.utilization",
  ProcessMemoryUsage = "process.runtime.jvm.memory.usage",
  GcDuration = "process.runtime.jvm.gc.duration",
  ThreadCount = "process.runtime.jvm.threads.count",

  // Custom application metrics
  CustomCounter = "custom.counter",
  CustomGauge = "custom.gauge",
  CustomHistogram = "custom.histogram",
}

export default MetricDashboardMetricType;

View File

@@ -0,0 +1,16 @@
/**
 * Metric names for profiling telemetry, all under the
 * "oneuptime.profile" namespace. Covers CPU/wall time, memory
 * allocation, and per-function aggregates derived from profiles.
 */
enum ProfileMetricType {
  // Duration of a CPU profile sampling window.
  CpuProfileDuration = "oneuptime.profile.cpu.duration",
  // Number of samples captured in a CPU profile.
  CpuProfileSampleCount = "oneuptime.profile.cpu.sample.count",
  // Wall-clock duration covered by a profile.
  WallClockDuration = "oneuptime.profile.wall.duration",
  // Total size of memory allocations observed.
  MemoryAllocationSize = "oneuptime.profile.memory.allocation.size",
  // Number of memory allocations observed.
  MemoryAllocationCount = "oneuptime.profile.memory.allocation.count",
  // Heap usage reported alongside profiles.
  HeapUsage = "oneuptime.profile.heap.usage",
  // Goroutine count — presumably only populated by Go-based profilers.
  GoroutineCount = "oneuptime.profile.goroutine.count",
  // Thread count of the profiled process.
  ThreadCount = "oneuptime.profile.thread.count",
  // Sampling rate the profiler was configured with.
  ProfileSampleRate = "oneuptime.profile.sample.rate",
  // Number of profiles received.
  ProfileCount = "oneuptime.profile.count",
  // CPU time attributed to the most expensive function.
  TopFunctionCpuTime = "oneuptime.profile.top.function.cpu.time",
  // Allocations attributed to the most allocating function.
  TopFunctionAllocations = "oneuptime.profile.top.function.allocations",
}

export default ProfileMetricType;

View File

@@ -0,0 +1,17 @@
/**
 * Metric names derived from trace spans, all under the
 * "oneuptime.span" namespace: volume, latency (with percentiles),
 * error rates, and status-code breakdowns.
 */
enum SpanMetricType {
  // Total number of spans.
  SpanCount = "oneuptime.span.count",
  // Span duration (latency).
  SpanDuration = "oneuptime.span.duration",
  // Number of spans that ended in error.
  SpanErrorCount = "oneuptime.span.error.count",
  // Fraction/rate of spans that ended in error.
  SpanErrorRate = "oneuptime.span.error.rate",
  // Rate of incoming spans (request rate).
  SpanRequestRate = "oneuptime.span.request.rate",
  // Latency percentiles.
  SpanP50Duration = "oneuptime.span.duration.p50",
  SpanP90Duration = "oneuptime.span.duration.p90",
  SpanP95Duration = "oneuptime.span.duration.p95",
  SpanP99Duration = "oneuptime.span.duration.p99",
  // Counts by OpenTelemetry span status (OK / ERROR / UNSET).
  SpanStatusOk = "oneuptime.span.status.ok",
  SpanStatusError = "oneuptime.span.status.error",
  SpanStatusUnset = "oneuptime.span.status.unset",
  // Overall span throughput.
  SpanThroughput = "oneuptime.span.throughput",
}

export default SpanMetricType;

View File

@@ -0,0 +1,100 @@
import AggregationType from "../../Types/BaseDatabase/AggregationType";
import AlertMetricType from "../../Types/Alerts/AlertMetricType";
/**
 * Presentation and aggregation helpers for alert metrics: which
 * aggregation to chart with, and the title / description / legend /
 * unit shown for each AlertMetricType.
 */
class AlertMetricTypeUtil {
  // Aggregation applied when charting each alert metric.
  private static readonly aggregationByType: {
    [key in AlertMetricType]?: AggregationType;
  } = {
    [AlertMetricType.AlertCount]: AggregationType.Sum,
    [AlertMetricType.TimeToAcknowledge]: AggregationType.Avg,
    [AlertMetricType.TimeToResolve]: AggregationType.Avg,
    [AlertMetricType.AlertDuration]: AggregationType.Avg,
  };

  // Human-readable chart titles.
  private static readonly titleByType: {
    [key in AlertMetricType]?: string;
  } = {
    [AlertMetricType.AlertCount]: "Alert Count",
    [AlertMetricType.TimeToAcknowledge]: "Time to Acknowledge",
    [AlertMetricType.TimeToResolve]: "Time to Resolve",
    [AlertMetricType.AlertDuration]: "Alert Duration",
  };

  // Longer descriptions shown alongside charts.
  private static readonly descriptionByType: {
    [key in AlertMetricType]?: string;
  } = {
    [AlertMetricType.AlertCount]:
      "The number of alerts created for this monitor over time.",
    [AlertMetricType.TimeToAcknowledge]:
      "The average time taken to acknowledge alerts for this monitor.",
    [AlertMetricType.TimeToResolve]:
      "The average time taken to resolve alerts for this monitor.",
    [AlertMetricType.AlertDuration]:
      "The average duration of alerts for this monitor.",
  };

  // Chart legend labels.
  private static readonly legendByType: {
    [key in AlertMetricType]?: string;
  } = {
    [AlertMetricType.AlertCount]: "Alerts",
    [AlertMetricType.TimeToAcknowledge]: "Time to Acknowledge",
    [AlertMetricType.TimeToResolve]: "Time to Resolve",
    [AlertMetricType.AlertDuration]: "Duration",
  };

  // Legend units ("s" for time-based metrics, empty for counts).
  private static readonly legendUnitByType: {
    [key in AlertMetricType]?: string;
  } = {
    [AlertMetricType.AlertCount]: "",
    [AlertMetricType.TimeToAcknowledge]: "s",
    [AlertMetricType.TimeToResolve]: "s",
    [AlertMetricType.AlertDuration]: "s",
  };

  /**
   * Returns the aggregation to apply for the given metric.
   * @throws Error for metric types not covered by the table.
   */
  public static getAggregationTypeByAlertMetricType(
    metricType: AlertMetricType,
  ): AggregationType {
    const aggregationType: AggregationType | undefined =
      this.aggregationByType[metricType];
    if (aggregationType === undefined) {
      throw new Error("Invalid AlertMetricType value");
    }
    return aggregationType;
  }

  /** All alert metric types shown in the dashboard, in display order. */
  public static getAllAlertMetricTypes(): Array<AlertMetricType> {
    return [
      AlertMetricType.AlertCount,
      AlertMetricType.TimeToAcknowledge,
      AlertMetricType.TimeToResolve,
      AlertMetricType.AlertDuration,
    ];
  }

  /** Chart title for the metric, or "" when unknown. */
  public static getTitleByAlertMetricType(
    metricType: AlertMetricType,
  ): string {
    return this.titleByType[metricType] ?? "";
  }

  /** Description for the metric, or "" when unknown. */
  public static getDescriptionByAlertMetricType(
    metricType: AlertMetricType,
  ): string {
    return this.descriptionByType[metricType] ?? "";
  }

  /** Legend label for the metric, or "" when unknown. */
  public static getLegendByAlertMetricType(
    metricType: AlertMetricType,
  ): string {
    return this.legendByType[metricType] ?? "";
  }

  /** Legend unit for the metric ("" when unitless or unknown). */
  public static getLegendUnitByAlertMetricType(
    metricType: AlertMetricType,
  ): string {
    return this.legendUnitByType[metricType] ?? "";
  }
}

export default AlertMetricTypeUtil;

View File

@@ -0,0 +1,130 @@
import AggregationType from "../../Types/BaseDatabase/AggregationType";
import IncidentMetricType from "../../Types/Incident/IncidentMetricType";
/**
 * Presentation and aggregation helpers for incident metrics: which
 * aggregation to chart with, and the title / description / legend /
 * unit shown for each IncidentMetricType.
 */
class IncidentMetricTypeUtil {
  // Aggregation applied when charting each incident metric.
  private static readonly aggregationByType: {
    [key in IncidentMetricType]?: AggregationType;
  } = {
    [IncidentMetricType.IncidentCount]: AggregationType.Sum,
    [IncidentMetricType.TimeToAcknowledge]: AggregationType.Avg,
    [IncidentMetricType.TimeToResolve]: AggregationType.Avg,
    [IncidentMetricType.IncidentDuration]: AggregationType.Avg,
    [IncidentMetricType.TimeInState]: AggregationType.Avg,
    [IncidentMetricType.SeverityChange]: AggregationType.Sum,
    [IncidentMetricType.PostmortemCompletionTime]: AggregationType.Avg,
  };

  // Human-readable chart titles.
  private static readonly titleByType: {
    [key in IncidentMetricType]?: string;
  } = {
    [IncidentMetricType.IncidentCount]: "Incident Count",
    [IncidentMetricType.TimeToAcknowledge]: "Time to Acknowledge",
    [IncidentMetricType.TimeToResolve]: "Time to Resolve",
    [IncidentMetricType.IncidentDuration]: "Incident Duration",
    [IncidentMetricType.TimeInState]: "Time in State",
    [IncidentMetricType.SeverityChange]: "Severity Changes",
    [IncidentMetricType.PostmortemCompletionTime]:
      "Postmortem Completion Time",
  };

  // Longer descriptions shown alongside charts.
  private static readonly descriptionByType: {
    [key in IncidentMetricType]?: string;
  } = {
    [IncidentMetricType.IncidentCount]:
      "The number of incidents created for this monitor over time.",
    [IncidentMetricType.TimeToAcknowledge]:
      "The average time taken to acknowledge incidents for this monitor.",
    [IncidentMetricType.TimeToResolve]:
      "The average time taken to resolve incidents for this monitor.",
    [IncidentMetricType.IncidentDuration]:
      "The average duration of incidents for this monitor.",
    [IncidentMetricType.TimeInState]:
      "The average time incidents spend in each state for this monitor.",
    [IncidentMetricType.SeverityChange]:
      "The number of severity changes for incidents related to this monitor.",
    [IncidentMetricType.PostmortemCompletionTime]:
      "The average time taken to complete postmortems for incidents related to this monitor.",
  };

  // Chart legend labels.
  private static readonly legendByType: {
    [key in IncidentMetricType]?: string;
  } = {
    [IncidentMetricType.IncidentCount]: "Incidents",
    [IncidentMetricType.TimeToAcknowledge]: "Time to Acknowledge",
    [IncidentMetricType.TimeToResolve]: "Time to Resolve",
    [IncidentMetricType.IncidentDuration]: "Duration",
    [IncidentMetricType.TimeInState]: "Time in State",
    [IncidentMetricType.SeverityChange]: "Severity Changes",
    [IncidentMetricType.PostmortemCompletionTime]: "Postmortem Time",
  };

  // Legend units ("s" for time-based metrics, empty for counts).
  private static readonly legendUnitByType: {
    [key in IncidentMetricType]?: string;
  } = {
    [IncidentMetricType.IncidentCount]: "",
    [IncidentMetricType.TimeToAcknowledge]: "s",
    [IncidentMetricType.TimeToResolve]: "s",
    [IncidentMetricType.IncidentDuration]: "s",
    [IncidentMetricType.TimeInState]: "s",
    [IncidentMetricType.SeverityChange]: "",
    [IncidentMetricType.PostmortemCompletionTime]: "s",
  };

  /**
   * Returns the aggregation to apply for the given metric.
   * @throws Error for metric types not covered by the table.
   */
  public static getAggregationTypeByIncidentMetricType(
    metricType: IncidentMetricType,
  ): AggregationType {
    const aggregationType: AggregationType | undefined =
      this.aggregationByType[metricType];
    if (aggregationType === undefined) {
      throw new Error("Invalid IncidentMetricType value");
    }
    return aggregationType;
  }

  /**
   * Incident metric types shown in the dashboard, in display order.
   * NOTE(review): this intentionally(?) omits TimeInState, SeverityChange
   * and PostmortemCompletionTime even though the other helpers handle
   * them — confirm whether those should be listed too.
   */
  public static getAllIncidentMetricTypes(): Array<IncidentMetricType> {
    return [
      IncidentMetricType.IncidentCount,
      IncidentMetricType.TimeToAcknowledge,
      IncidentMetricType.TimeToResolve,
      IncidentMetricType.IncidentDuration,
    ];
  }

  /** Chart title for the metric, or "" when unknown. */
  public static getTitleByIncidentMetricType(
    metricType: IncidentMetricType,
  ): string {
    return this.titleByType[metricType] ?? "";
  }

  /** Description for the metric, or "" when unknown. */
  public static getDescriptionByIncidentMetricType(
    metricType: IncidentMetricType,
  ): string {
    return this.descriptionByType[metricType] ?? "";
  }

  /** Legend label for the metric, or "" when unknown. */
  public static getLegendByIncidentMetricType(
    metricType: IncidentMetricType,
  ): string {
    return this.legendByType[metricType] ?? "";
  }

  /** Legend unit for the metric ("" when unitless or unknown). */
  public static getLegendUnitByIncidentMetricType(
    metricType: IncidentMetricType,
  ): string {
    return this.legendUnitByType[metricType] ?? "";
  }
}

export default IncidentMetricTypeUtil;

View File

@@ -4,6 +4,7 @@ import InfrastructureStatus from "Common/Server/Infrastructure/Status";
import logger from "Common/Server/Utils/Logger";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import "ejs";
const APP_NAME: string = "home";
@@ -15,6 +16,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
const statusCheck: PromiseVoidFunction = async (): Promise<void> => {
// Check the status of infrastructure components
return await InfrastructureStatus.checkStatusWithRetry({

View File

@@ -9,6 +9,7 @@ import Express, { ExpressApplication } from "Common/Server/Utils/Express";
import logger from "Common/Server/Utils/Logger";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import { PromiseVoidFunction } from "Common/Types/FunctionTypes";
import "ejs";
@@ -75,6 +76,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
// Initialize the app with service name and status checks
await App.init({
appName: APP_NAME,

View File

@@ -16,6 +16,7 @@ import { PromiseVoidFunction } from "Common/Types/FunctionTypes";
import logger from "Common/Server/Utils/Logger";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import Express, { ExpressApplication } from "Common/Server/Utils/Express";
import "ejs";
@@ -47,6 +48,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
logger.info(
`Probe Service - Monitoring workers: ${PROBE_MONITORING_WORKERS}, Monitor fetch limit: ${PROBE_MONITOR_FETCH_LIMIT}, Script timeout: ${PROBE_SYNTHETIC_MONITOR_SCRIPT_TIMEOUT_IN_MS}ms / ${PROBE_CUSTOM_CODE_MONITOR_SCRIPT_TIMEOUT_IN_MS}ms, Retry limit: ${PROBE_MONITOR_RETRY_LIMIT}`,
);

View File

@@ -21,6 +21,7 @@ import logger from "Common/Server/Utils/Logger";
import Realtime from "Common/Server/Utils/Realtime";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import "./Jobs/TelemetryIngest/ProcessTelemetry";
import { TELEMETRY_CONCURRENCY } from "./Config";
import type { StatusAPIOptions } from "Common/Server/API/StatusAPI";
@@ -79,6 +80,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
logger.info(
`Telemetry Service - Queue concurrency: ${TELEMETRY_CONCURRENCY}`,
);

View File

@@ -5,6 +5,7 @@ import Express, { ExpressApplication } from "Common/Server/Utils/Express";
import logger from "Common/Server/Utils/Logger";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import "ejs";
const app: ExpressApplication = Express.getExpressApp();
@@ -21,6 +22,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
// init the app
await App.init({
appName: APP_NAME,

View File

@@ -5,6 +5,7 @@ import InfrastructureStatus from "Common/Server/Infrastructure/Status";
import logger from "Common/Server/Utils/Logger";
import App from "Common/Server/Utils/StartServer";
import Telemetry from "Common/Server/Utils/Telemetry";
import Profiling from "Common/Server/Utils/Profiling";
import Realtime from "Common/Server/Utils/Realtime";
import PostgresAppInstance from "Common/Server/Infrastructure/PostgresDatabase";
import Redis from "Common/Server/Infrastructure/Redis";
@@ -23,6 +24,11 @@ const init: PromiseVoidFunction = async (): Promise<void> => {
serviceName: APP_NAME,
});
// Initialize profiling (opt-in via ENABLE_PROFILING env var)
Profiling.init({
serviceName: APP_NAME,
});
logger.debug("Telemetry initialized");
logger.info(`Worker Service - Queue concurrency: ${WORKER_CONCURRENCY}`);

View File

@@ -305,6 +305,14 @@ DISABLE_TELEMETRY_FOR_WORKER=true
DISABLE_TELEMETRY_FOR_AI_AGENT=true
# By default profiling is disabled for all services. Set to true to enable CPU profiling for a service.
ENABLE_PROFILING_FOR_APP=false
ENABLE_PROFILING_FOR_TELEMETRY=false
ENABLE_PROFILING_FOR_TEST_SERVER=false
ENABLE_PROFILING_FOR_PROBE=false
ENABLE_PROFILING_FOR_WORKER=false
ENABLE_PROFILING_FOR_AI_AGENT=false
# Connect OneUptime with Slack App
SLACK_APP_CLIENT_ID=

View File

@@ -223,6 +223,7 @@ services:
<<: *common-runtime-variables
PORT: ${TEST_SERVER_PORT}
DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_TEST_SERVER}
ENABLE_PROFILING: ${ENABLE_PROFILING_FOR_TEST_SERVER}
logging:
driver: "local"
options:
@@ -242,6 +243,7 @@ services:
SMS_HIGH_RISK_COST_IN_CENTS: ${SMS_HIGH_RISK_COST_IN_CENTS}
CALL_HIGH_RISK_COST_IN_CENTS_PER_MINUTE: ${CALL_HIGH_RISK_COST_IN_CENTS_PER_MINUTE}
DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_APP}
ENABLE_PROFILING: ${ENABLE_PROFILING_FOR_APP}
logging:
driver: "local"
options:
@@ -267,6 +269,7 @@ services:
<<: *common-runtime-variables
PORT: ${WORKER_PORT}
DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_WORKER}
ENABLE_PROFILING: ${ENABLE_PROFILING_FOR_WORKER}
WORKER_CONCURRENCY: ${WORKER_CONCURRENCY}
logging:
driver: "local"
@@ -290,6 +293,7 @@ services:
NODE_ENV: ${ENVIRONMENT}
LOG_LEVEL: ${LOG_LEVEL}
DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_PROBE}
ENABLE_PROFILING: ${ENABLE_PROFILING_FOR_PROBE}
OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT: ${OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT}
OPENTELEMETRY_EXPORTER_OTLP_HEADERS: ${OPENTELEMETRY_EXPORTER_OTLP_HEADERS}
logging:
@@ -314,6 +318,7 @@ services:
NODE_ENV: ${ENVIRONMENT}
LOG_LEVEL: ${LOG_LEVEL}
DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_PROBE}
ENABLE_PROFILING: ${ENABLE_PROFILING_FOR_PROBE}
OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT: ${OPENTELEMETRY_EXPORTER_OTLP_ENDPOINT}
OPENTELEMETRY_EXPORTER_OTLP_HEADERS: ${OPENTELEMETRY_EXPORTER_OTLP_HEADERS}
logging:
@@ -329,6 +334,7 @@ services:
AI_AGENT_KEY: ${AI_AGENT_KEY}
ONEUPTIME_URL: ${AI_AGENT_ONEUPTIME_URL}
DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_AI_AGENT}
ENABLE_PROFILING: ${ENABLE_PROFILING_FOR_AI_AGENT}
PORT: ${AI_AGENT_PORT}
logging:
driver: "local"
@@ -361,6 +367,7 @@ services:
<<: *common-runtime-variables
PORT: ${TELEMETRY_PORT}
DISABLE_TELEMETRY: ${DISABLE_TELEMETRY_FOR_TELEMETRY}
ENABLE_PROFILING: ${ENABLE_PROFILING_FOR_TELEMETRY}
# Max concurrent telemetry jobs the worker will process
TELEMETRY_CONCURRENCY: ${TELEMETRY_CONCURRENCY}
REGISTER_PROBE_KEY: ${REGISTER_PROBE_KEY}