Merge pull request #2237 from OneUptime/alert-episode

Alert episode
This commit is contained in:
Simon Larsen
2026-01-20 19:08:04 +00:00
committed by GitHub
37 changed files with 8741 additions and 108 deletions

View File

@@ -0,0 +1,548 @@
# Data Models for Alert Deduplication
## Overview
This document defines the database models required for Alert Deduplication functionality.
## Entity Relationship Diagram
```
┌─────────────────────────┐
│ AlertFingerprint │
├─────────────────────────┤
│ id │
│ projectId │
│ fingerprint (hash) │◄──────┐
│ fingerprintFields │ │
│ canonicalAlertId │───────┼──► Alert
│ duplicateCount │ │
│ lastDuplicateAt │ │
│ windowStartAt │ │
│ windowEndAt │ │
└─────────────────────────┘ │
┌─────────────────────────────────┴───────────────────────────────────┐
│ Alert (existing) │
├─────────────────────────────────────────────────────────────────────┤
│ + fingerprint (NEW) - SHA-256 hash of alert │
│ + duplicateCount (NEW) - Number of duplicates suppressed │
│ + lastDuplicateAt (NEW) - When last duplicate occurred │
└─────────────────────────────────────────────────────────────────────┘
```
---
## Model Definitions
### 1. AlertFingerprint
Cache of active fingerprints for deduplication lookups.
**File Location:** `/Common/Models/DatabaseModels/AlertFingerprint.ts`
```typescript
import {
Column,
Entity,
Index,
JoinColumn,
ManyToOne,
} from 'typeorm';
import BaseModel from './DatabaseBaseModel/DatabaseBaseModel';
import Project from './Project';
import Alert from './Alert';
import ObjectID from 'Common/Types/ObjectID';
import ColumnType from 'Common/Types/Database/ColumnType';
import TableColumnType from 'Common/Types/Database/TableColumnType';
import Permission from 'Common/Types/Permission';
import IconProp from 'Common/Types/Icon/IconProp';
/**
 * Database model for one alert fingerprint within one deduplication window.
 *
 * Each row stores the fingerprint hash, the names of the fields it was
 * computed from, the canonical alert it points to, a duplicate counter with
 * the time of the last duplicate, and the start/end of the window.
 *
 * Every column below declares empty `create` and `update` permission lists
 * and limits `read` to ProjectOwner and ProjectAdmin.
 * NOTE(review): presumably rows are only written by server-side code that
 * runs with root props — confirm against ColumnAccessControl semantics.
 */
@TableMetadata({
tableName: 'AlertFingerprint',
singularName: 'Alert Fingerprint',
pluralName: 'Alert Fingerprints',
icon: IconProp.Key,
tableDescription: 'Stores fingerprints for alert deduplication',
})
@Entity({
name: 'AlertFingerprint',
})
export default class AlertFingerprint extends BaseModel {
// ─────────────────────────────────────────────────────────────
// PROJECT
// ─────────────────────────────────────────────────────────────
// Relation to the owning project, joined through the `projectId` column.
// `onDelete: 'CASCADE'` removes fingerprint rows together with the project.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Entity,
modelType: Project,
title: 'Project',
})
@ManyToOne(() => Project, {
onDelete: 'CASCADE',
orphanedRowAction: 'delete',
})
@JoinColumn({ name: 'projectId' })
public project?: Project = undefined;
// Foreign-key column behind `project`; required and individually indexed.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.ObjectID,
title: 'Project ID',
})
@Column({
type: ColumnType.ObjectID,
nullable: false,
})
@Index()
public projectId?: ObjectID = undefined;
// ─────────────────────────────────────────────────────────────
// FINGERPRINT
// ─────────────────────────────────────────────────────────────
// The hash itself: required, indexed, limited to 64 characters.
// The index is not declared unique, so the schema alone does not prevent
// several rows with the same hash.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.ShortText,
title: 'Fingerprint',
description: 'SHA-256 hash of the alert fields',
})
@Column({
type: ColumnType.ShortText,
length: 64, // SHA-256 hex length
nullable: false,
})
@Index()
public fingerprint?: string = undefined;
// Names of the alert fields that went into the hash, stored as a JSON array.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.JSON,
title: 'Fingerprint Fields',
description: 'Fields used to compute this fingerprint',
})
@Column({
type: ColumnType.JSON,
nullable: false,
})
public fingerprintFields?: Array<string> = undefined;
// ─────────────────────────────────────────────────────────────
// CANONICAL ALERT
// ─────────────────────────────────────────────────────────────
// Relation to the original alert, joined through `canonicalAlertId`.
// `onDelete: 'CASCADE'` removes the fingerprint row together with that alert.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Entity,
modelType: Alert,
title: 'Canonical Alert',
description: 'The original alert this fingerprint refers to',
})
@ManyToOne(() => Alert, {
onDelete: 'CASCADE',
orphanedRowAction: 'delete',
})
@JoinColumn({ name: 'canonicalAlertId' })
public canonicalAlert?: Alert = undefined;
// Foreign-key column behind `canonicalAlert`; required and individually indexed.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.ObjectID,
title: 'Canonical Alert ID',
description: 'ID of the original alert',
})
@Column({
type: ColumnType.ObjectID,
nullable: false,
})
@Index()
public canonicalAlertId?: ObjectID = undefined;
// ─────────────────────────────────────────────────────────────
// DUPLICATE TRACKING
// ─────────────────────────────────────────────────────────────
// Counter of suppressed duplicates; required, database default 0.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Duplicate Count',
description: 'Number of duplicate alerts suppressed',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public duplicateCount?: number = undefined;
// Time of the most recent duplicate; the only nullable column on this model.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Last Duplicate At',
description: 'When the last duplicate was received',
})
@Column({
type: ColumnType.Date,
nullable: true,
})
public lastDuplicateAt?: Date = undefined;
// ─────────────────────────────────────────────────────────────
// TIME WINDOW
// ─────────────────────────────────────────────────────────────
// Start of the deduplication window; required, not indexed.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Window Start',
description: 'When this deduplication window started',
})
@Column({
type: ColumnType.Date,
nullable: false,
})
public windowStartAt?: Date = undefined;
// End (expiry) of the deduplication window; required and indexed.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Window End',
description: 'When this deduplication window expires',
})
@Column({
type: ColumnType.Date,
nullable: false,
})
@Index()
public windowEndAt?: Date = undefined;
}
```
---
### 2. Alert Model Enhancements
Add the following deduplication fields to the existing Alert model.
**File Location:** `/Common/Models/DatabaseModels/Alert.ts` (modifications)
```typescript
// Add these fields to the existing Alert model:
// ─────────────────────────────────────────────────────────────
// DEDUPLICATION FIELDS (NEW)
// ─────────────────────────────────────────────────────────────
// All three columns are readable by ProjectOwner, ProjectAdmin and
// ProjectMember and declare empty `create` / `update` permission lists.
// NOTE(review): presumably only server-side code running with root props
// writes them — confirm against ColumnAccessControl semantics.

// Fingerprint hash of this alert: nullable, at most 64 characters, indexed.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.ShortText,
title: 'Fingerprint',
description: 'SHA-256 fingerprint hash for deduplication',
})
@Column({
type: ColumnType.ShortText,
length: 64,
nullable: true,
})
@Index()
public fingerprint?: string = undefined;
// Number of suppressed duplicates: required, database default 0.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Duplicate Count',
description: 'Number of duplicate alerts that were suppressed',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public duplicateCount?: number = undefined;
// Time of the most recent duplicate: nullable.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Last Duplicate At',
description: 'When the last duplicate occurred',
})
@Column({
type: ColumnType.Date,
nullable: true,
})
public lastDuplicateAt?: Date = undefined;
```
---
### 3. DeduplicationConfig (Project Settings)
Add the deduplication settings to the Project model (Option A) or create a separate settings model (Option B).
**Option A: Add to Project model**
```typescript
// In Project model, add:
// Nullable JSON column that holds the project's DeduplicationConfig object.
// NOTE(review): `create` is limited to ProjectOwner while `update` also
// allows ProjectAdmin — confirm this asymmetry is intended.
@ColumnAccessControl({
create: [Permission.ProjectOwner],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [Permission.ProjectOwner, Permission.ProjectAdmin],
})
@TableColumn({
type: TableColumnType.JSON,
title: 'Deduplication Config',
description: 'Alert deduplication settings for this project',
})
@Column({
type: ColumnType.JSON,
nullable: true,
})
public alertDeduplicationConfig?: DeduplicationConfig = undefined;
```
**Option B: Separate AlertDeduplicationConfig model**
```typescript
/**
 * Project-level deduplication settings stored as their own table
 * (alternative to a JSON column on Project).
 *
 * The unique index on `projectId` allows at most one settings row per
 * project. Every column can be created and updated by ProjectOwner and
 * ProjectAdmin and read by ProjectOwner, ProjectAdmin and ProjectMember.
 */
@TableMetadata({
  tableName: 'AlertDeduplicationConfig',
  singularName: 'Deduplication Config',
  pluralName: 'Deduplication Configs',
  icon: IconProp.Settings,
  tableDescription: 'Project-level deduplication settings',
})
@Entity({
  name: 'AlertDeduplicationConfig',
})
export default class AlertDeduplicationConfig extends BaseModel {
  // Owning project; required, and unique so each project has one config row.
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.ObjectID,
    title: 'Project ID',
  })
  @Column({
    type: ColumnType.ObjectID,
    nullable: false,
  })
  @Index({ unique: true })
  public projectId?: ObjectID = undefined;

  // Master switch; required, database default `true`.
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Boolean,
    title: 'Enabled',
    description: 'Whether deduplication is enabled',
  })
  @Column({
    type: ColumnType.Boolean,
    nullable: false,
    default: true,
  })
  public enabled?: boolean = undefined;

  // Length of the deduplication window in minutes; required, default 60.
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Window Minutes',
    description: 'Time window for deduplication (minutes)',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
    default: 60,
  })
  public windowMinutes?: number = undefined;

  // Names of the alert fields that are hashed into the fingerprint.
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.JSON,
    title: 'Fingerprint Fields',
    description: 'Fields to include in fingerprint',
  })
  @Column({
    type: ColumnType.JSON,
    nullable: false,
    // The default must be valid JSON. A single-quoted string such as
    // "['monitorId', ...]" is not JSON and would also place unescaped quotes
    // inside the generated SQL default, so pass a real array and let the ORM
    // serialize it.
    default: ['monitorId', 'createdCriteriaId', 'alertSeverityId', 'title'],
  })
  public fingerprintFields?: Array<string> = undefined;

  // Whether string values are lower-cased and trimmed before hashing;
  // required, database default `true`.
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Boolean,
    title: 'Normalize Strings',
    description: 'Whether to normalize strings (lowercase, trim)',
  })
  @Column({
    type: ColumnType.Boolean,
    nullable: false,
    default: true,
  })
  public normalizeStrings?: boolean = undefined;
}
```
---
## Type Definitions
```typescript
// /Common/Types/Alert/DeduplicationConfig.ts
/**
 * Settings that control alert deduplication.
 */
export interface DeduplicationConfig {
  /** Turns deduplication on or off. */
  enabled: boolean;
  /** Length of the deduplication time window, in minutes. */
  windowMinutes: number;
  /** Names of the alert fields that are included in the fingerprint. */
  fingerprintFields: string[];
  /** When true, strings are normalized (lowercase, trim). */
  normalizeStrings: boolean;
}

/**
 * Configuration applied when nothing else has been configured:
 * enabled, a 60-minute window, four fingerprint fields and string
 * normalization switched on.
 */
export const DEFAULT_DEDUPLICATION_CONFIG: DeduplicationConfig = {
  enabled: true,
  windowMinutes: 60,
  fingerprintFields: [
    'monitorId',
    'createdCriteriaId',
    'alertSeverityId',
    'title',
  ],
  normalizeStrings: true,
};
// One row per selectable field: [field, label, description].
const FINGERPRINT_FIELD_ROWS: ReadonlyArray<readonly [string, string, string]> = [
  ['monitorId', 'Monitor', 'Include monitor in fingerprint'],
  ['createdCriteriaId', 'Criteria', 'Include alert criteria in fingerprint'],
  ['alertSeverityId', 'Severity', 'Include severity in fingerprint'],
  ['title', 'Title', 'Include alert title in fingerprint'],
  ['description', 'Description', 'Include alert description in fingerprint'],
];

/**
 * Catalogue of the alert fields that can be selected for fingerprinting,
 * each with a display label and a short description.
 */
export const AVAILABLE_FINGERPRINT_FIELDS: Array<{
  field: string;
  label: string;
  description: string;
}> = FINGERPRINT_FIELD_ROWS.map(([field, label, description]) => {
  return { field, label, description };
});
```
---
## Database Indexes
```sql
-- AlertFingerprint indexes
CREATE INDEX idx_fingerprint_lookup
ON "AlertFingerprint" ("projectId", "fingerprint", "windowEndAt");
CREATE INDEX idx_fingerprint_cleanup
ON "AlertFingerprint" ("windowEndAt");
CREATE INDEX idx_fingerprint_alert
ON "AlertFingerprint" ("canonicalAlertId");
-- Alert fingerprint index
CREATE INDEX idx_alert_fingerprint
ON "Alert" ("projectId", "fingerprint")
WHERE "fingerprint" IS NOT NULL;
```
---
## Implementation Checklist
- [ ] Create AlertFingerprint model
- [ ] Add fingerprint fields to Alert model
- [ ] Create DeduplicationConfig type
- [ ] Add config to Project model (or create separate model)
- [ ] Register models in model registry
- [ ] Create database migrations
- [ ] Add indexes
- [ ] Update API permissions

View File

@@ -0,0 +1,667 @@
# Backend Implementation for Alert Deduplication
## Overview
This document details the backend services and components required for Alert Deduplication functionality.
## Core Components
### 1. FingerprintGenerator
Generates unique fingerprints for alerts based on configurable fields.
**File Location:** `/Common/Server/Utils/Alert/FingerprintGenerator.ts`
```typescript
import Alert from '../../Models/DatabaseModels/Alert';
import crypto from 'crypto';
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
/**
 * Computes deterministic SHA-256 fingerprints for alerts from a configurable
 * list of field names, and offers small helpers around them.
 */
export default class FingerprintGenerator {
  /**
   * Default fields used for fingerprinting. Also used when the configured
   * field list is missing or empty.
   */
  public static DEFAULT_FIELDS: Array<string> = [
    'monitorId',
    'createdCriteriaId',
    'alertSeverityId',
    'title',
  ];

  /**
   * Generate a fingerprint hash for an alert.
   *
   * @param alert - Alert data to read the field values from.
   * @param config - Optional settings; `fingerprintFields` selects the fields
   *   (in the given order) and `normalizeStrings` (default `true`) lower-cases
   *   and trims every value before hashing.
   * @returns The SHA-256 digest as a 64-character lowercase hex string.
   */
  public static generate(
    alert: Partial<Alert>,
    config?: Partial<DeduplicationConfig>
  ): string {
    // An empty list is truthy, so `||` alone would hash nothing and give every
    // alert the same fingerprint. Treat "empty" like "not configured".
    const configuredFields = config?.fingerprintFields;
    const fields =
      configuredFields && configuredFields.length > 0
        ? configuredFields
        : this.DEFAULT_FIELDS;
    const normalizeStrings = config?.normalizeStrings ?? true;

    const pairs: Array<[string, string]> = fields.map(
      (field: string): [string, string] => {
        const rawValue = this.getFieldValue(alert, field);
        return [
          field,
          normalizeStrings ? rawValue.toLowerCase().trim() : rawValue,
        ];
      }
    );

    // JSON keeps field/value boundaries unambiguous. A plain `field:value|...`
    // join lets a value that contains `|` imitate a following field, which
    // makes two different alerts hash to the same fingerprint.
    return crypto
      .createHash('sha256')
      .update(JSON.stringify(pairs))
      .digest('hex');
  }

  /**
   * Get a field value from an alert object as a string.
   *
   * Known fields are read directly from the alert; any other name is looked
   * up in `alert.customFields`. Missing values yield the empty string.
   */
  private static getFieldValue(alert: Partial<Alert>, field: string): string {
    switch (field) {
      case 'monitorId':
        return alert.monitorId?.toString() || '';
      case 'createdCriteriaId':
        return alert.createdCriteriaId?.toString() || '';
      case 'alertSeverityId':
      case 'severity':
        return alert.alertSeverityId?.toString() || '';
      case 'title':
        return alert.title || '';
      case 'description':
        return alert.description || '';
      case 'createdByProbeId':
        return alert.createdByProbeId?.toString() || '';
      default: {
        // Try to get from customFields
        const customFields: unknown = alert.customFields;
        if (customFields === null || typeof customFields !== 'object') {
          return '';
        }
        const customValue = (customFields as Record<string, unknown>)[field];
        // Narrow before converting: calling a method on `unknown` does not
        // type-check under strict mode.
        if (customValue === undefined || customValue === null) {
          return '';
        }
        return String(customValue);
      }
    }
  }

  /**
   * Validate that all required fields are present for fingerprinting.
   *
   * @returns `valid` is true when no field resolved to an empty value;
   *   `missingFields` lists the fields that did, in input order.
   */
  public static validateFields(
    alert: Partial<Alert>,
    fields: Array<string>
  ): { valid: boolean; missingFields: Array<string> } {
    const missingFields = fields.filter((field: string): boolean => {
      return !this.getFieldValue(alert, field);
    });
    return {
      valid: missingFields.length === 0,
      missingFields,
    };
  }

  /**
   * Compare two fingerprints for exact string equality.
   */
  public static areEqual(fingerprint1: string, fingerprint2: string): boolean {
    return fingerprint1 === fingerprint2;
  }
}
```
---
### 2. DeduplicationEngine
Handles the core deduplication logic.
**File Location:** `/Common/Server/Utils/Alert/DeduplicationEngine.ts`
```typescript
import Alert from '../../Models/DatabaseModels/Alert';
import AlertFingerprint from '../../Models/DatabaseModels/AlertFingerprint';
import AlertFingerprintService from '../../Services/AlertFingerprintService';
import AlertService from '../../Services/AlertService';
import FingerprintGenerator from './FingerprintGenerator';
import ObjectID from 'Common/Types/ObjectID';
import OneUptimeDate from 'Common/Types/Date';
import QueryHelper from '../../Types/Database/QueryHelper';
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
/**
 * Outcome of a deduplication check for one incoming alert.
 */
export interface DeduplicationResult {
// True when an active fingerprint record matched the incoming alert.
isDuplicate: boolean;
// ID of the alert the matching fingerprint points to (set for duplicates).
canonicalAlertId?: ObjectID;
// The canonical alert itself, when it could be loaded (set for duplicates).
canonicalAlert?: Alert;
// Duplicate counter after counting this alert (set for duplicates).
duplicateCount?: number;
// Fingerprint computed for the incoming alert; always present.
fingerprint: string;
}
/**
 * Core deduplication logic: decides whether an incoming alert duplicates an
 * alert with an active fingerprint window, registers fingerprints for newly
 * created alerts, and aggregates statistics.
 *
 * All database access in this class runs with `props: { isRoot: true }`.
 */
export default class DeduplicationEngine {
  /**
   * Check if an alert is a duplicate of an existing alert.
   *
   * When an active fingerprint record (window end in the future) matches, the
   * duplicate counter and `lastDuplicateAt` are written to both the
   * fingerprint record and the canonical alert.
   *
   * NOTE(review): the counter is read and then written back, so two
   * concurrent duplicates can both store the same value. An atomic increment
   * would be needed to make this exact.
   *
   * @param alertData - Data of the alert that is about to be created.
   * @param projectId - Project the alert belongs to.
   * @param config - Optional overrides merged over DEFAULT_DEDUPLICATION_CONFIG.
   * @returns The fingerprint and, for duplicates, the canonical alert details.
   */
  public static async checkDuplicate(
    alertData: Partial<Alert>,
    projectId: ObjectID,
    config?: Partial<DeduplicationConfig>
  ): Promise<DeduplicationResult> {
    const mergedConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };

    // Generate fingerprint
    const fingerprint = FingerprintGenerator.generate(alertData, mergedConfig);

    // One timestamp for the window check and for both updates, so the
    // fingerprint record and the canonical alert always carry the same
    // `lastDuplicateAt`.
    const now = new Date();

    // Check if fingerprint exists in active window
    const existingFingerprint = await AlertFingerprintService.findOneBy({
      query: {
        projectId,
        fingerprint,
        windowEndAt: QueryHelper.greaterThan(now),
      },
      select: {
        _id: true,
        canonicalAlertId: true,
        duplicateCount: true,
      },
      props: { isRoot: true },
    });

    if (!existingFingerprint) {
      // Not a duplicate
      return {
        isDuplicate: false,
        fingerprint,
      };
    }

    // It's a duplicate - update counters
    const newDuplicateCount = (existingFingerprint.duplicateCount || 0) + 1;
    const duplicateUpdate = {
      duplicateCount: newDuplicateCount,
      lastDuplicateAt: now,
    };

    await AlertFingerprintService.updateOneById({
      id: existingFingerprint.id!,
      data: { ...duplicateUpdate },
      props: { isRoot: true },
    });

    // Update the canonical alert's duplicate count
    await AlertService.updateOneById({
      id: existingFingerprint.canonicalAlertId!,
      data: { ...duplicateUpdate },
      props: { isRoot: true },
    });

    // Get the canonical alert for return
    const canonicalAlert = await AlertService.findOneById({
      id: existingFingerprint.canonicalAlertId!,
      select: {
        _id: true,
        title: true,
        alertNumber: true,
      },
      props: { isRoot: true },
    });

    return {
      isDuplicate: true,
      canonicalAlertId: existingFingerprint.canonicalAlertId,
      canonicalAlert: canonicalAlert || undefined,
      duplicateCount: newDuplicateCount,
      fingerprint,
    };
  }

  /**
   * Register a new fingerprint for an alert.
   *
   * Uses the alert's stored fingerprint when present, otherwise computes one.
   * The window starts now and ends `windowMinutes` later.
   *
   * @param alert - The created alert that becomes the canonical alert.
   * @param config - Optional overrides merged over DEFAULT_DEDUPLICATION_CONFIG.
   * @returns The created fingerprint record.
   */
  public static async registerFingerprint(
    alert: Alert,
    config?: Partial<DeduplicationConfig>
  ): Promise<AlertFingerprint> {
    const mergedConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };
    const fingerprint =
      alert.fingerprint || FingerprintGenerator.generate(alert, mergedConfig);

    const now = new Date();
    const windowEnd = OneUptimeDate.addRemoveMinutes(
      now,
      mergedConfig.windowMinutes
    );

    const fingerprintRecord = await AlertFingerprintService.create({
      data: {
        projectId: alert.projectId,
        fingerprint,
        fingerprintFields: mergedConfig.fingerprintFields,
        canonicalAlertId: alert.id,
        duplicateCount: 0,
        windowStartAt: now,
        windowEndAt: windowEnd,
      } as AlertFingerprint,
      props: { isRoot: true },
    });

    return fingerprintRecord;
  }

  /**
   * Process an alert through deduplication.
   *
   * @returns `shouldCreate` is false only when the alert is a duplicate.
   *   When the alert should be created, `alertData` is a copy that carries
   *   the computed fingerprint; for duplicates it is returned unchanged.
   */
  public static async processAlert(
    alertData: Partial<Alert>,
    projectId: ObjectID,
    config?: Partial<DeduplicationConfig>
  ): Promise<{
    shouldCreate: boolean;
    alertData: Partial<Alert>;
    deduplicationResult: DeduplicationResult;
  }> {
    const mergedConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };

    // Skip deduplication if disabled. The alert is still fingerprinted, but
    // no lookup is made.
    if (!mergedConfig.enabled) {
      const fingerprint = FingerprintGenerator.generate(alertData, mergedConfig);
      return {
        shouldCreate: true,
        alertData: { ...alertData, fingerprint },
        deduplicationResult: {
          isDuplicate: false,
          fingerprint,
        },
      };
    }

    // Check for duplicate
    const result = await this.checkDuplicate(alertData, projectId, mergedConfig);

    if (result.isDuplicate) {
      return {
        shouldCreate: false,
        alertData,
        deduplicationResult: result,
      };
    }

    // Not a duplicate - add fingerprint to alert data
    return {
      shouldCreate: true,
      alertData: { ...alertData, fingerprint: result.fingerprint },
      deduplicationResult: result,
    };
  }

  /**
   * Get deduplication statistics for a project.
   *
   * Counts the fingerprint records whose window started between `startDate`
   * and `endDate` as unique alerts and sums their duplicate counters.
   *
   * @returns Totals plus `deduplicationRate`, a percentage rounded to two
   *   decimals (0 when there are no alerts).
   */
  public static async getStatistics(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<{
    totalAlerts: number;
    uniqueAlerts: number;
    duplicateCount: number;
    deduplicationRate: number;
  }> {
    // Load the fingerprint records of the period
    const fingerprints = await AlertFingerprintService.findBy({
      query: {
        projectId,
        windowStartAt: QueryHelper.between(startDate, endDate),
      },
      select: {
        _id: true,
        duplicateCount: true,
      },
      props: { isRoot: true },
    });

    const uniqueAlerts = fingerprints.length;
    const duplicateCount = fingerprints.reduce(
      (sum, fp) => sum + (fp.duplicateCount || 0),
      0
    );
    const totalAlerts = uniqueAlerts + duplicateCount;
    const deduplicationRate =
      totalAlerts > 0 ? (duplicateCount / totalAlerts) * 100 : 0;

    return {
      totalAlerts,
      uniqueAlerts,
      duplicateCount,
      deduplicationRate: Math.round(deduplicationRate * 100) / 100,
    };
  }
}
```
---
### 3. AlertFingerprintService
Database service for AlertFingerprint model.
**File Location:** `/Common/Server/Services/AlertFingerprintService.ts`
```typescript
import DatabaseService from './DatabaseService';
import AlertFingerprint from '../Models/DatabaseModels/AlertFingerprint';
import ObjectID from 'Common/Types/ObjectID';
import QueryHelper from '../Types/Database/QueryHelper';
/**
 * Database service for AlertFingerprint records. Every query issued here
 * runs with `props: { isRoot: true }`.
 */
export class Service extends DatabaseService<AlertFingerprint> {
  public constructor() {
    super(AlertFingerprint);
  }

  /**
   * Deletes every fingerprint whose window ended before the current time.
   *
   * @returns Whatever `deleteBy` resolves to, typed here as a number.
   */
  public async cleanupExpired(): Promise<number> {
    const cutoff = new Date();
    return await this.deleteBy({
      query: {
        windowEndAt: QueryHelper.lessThan(cutoff),
      },
      props: { isRoot: true },
    });
  }

  /**
   * Loads the fingerprints of a project whose window has not ended yet.
   *
   * @param projectId - Project to load fingerprints for.
   * @returns Records with `_id`, `fingerprint`, `canonicalAlertId`,
   *   `duplicateCount` and `windowEndAt` selected.
   */
  public async getActiveFingerprints(
    projectId: ObjectID
  ): Promise<Array<AlertFingerprint>> {
    const stillOpenAfter = new Date();
    const activeRecords = await this.findBy({
      query: {
        projectId,
        windowEndAt: QueryHelper.greaterThan(stillOpenAfter),
      },
      select: {
        _id: true,
        fingerprint: true,
        canonicalAlertId: true,
        duplicateCount: true,
        windowEndAt: true,
      },
      props: { isRoot: true },
    });
    return activeRecords;
  }

  /**
   * Moves the end of a fingerprint's window to a new time
   * (for alerts that are still active).
   *
   * @param fingerprintId - ID of the fingerprint record to update.
   * @param newEndTime - New value for `windowEndAt`.
   */
  public async extendWindow(
    fingerprintId: ObjectID,
    newEndTime: Date
  ): Promise<void> {
    const patch = { windowEndAt: newEndTime };
    await this.updateOneById({
      id: fingerprintId,
      data: patch,
      props: { isRoot: true },
    });
  }
}

// Shared service instance used by importers of this module.
export default new Service();
```
---
### 4. Integration with AlertService
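Modify AlertService so that every new alert passes through the DeduplicationEngine before it is created, and so that a fingerprint is registered once creation succeeds.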
**File Location:** `/Common/Server/Services/AlertService.ts` (modifications)
```typescript
import DeduplicationEngine from '../Utils/Alert/DeduplicationEngine';
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
// In onBeforeCreate():
/**
 * Runs the incoming alert through the deduplication engine before creation.
 *
 * Loads the project's deduplication config, asks the engine whether the
 * alert should be created, and throws DuplicateAlertException (carrying the
 * canonical alert ID) when it is a duplicate. Otherwise the computed
 * fingerprint is copied onto the data to be created.
 *
 * NOTE(review): no return statement is shown in this snippet — it is
 * presumably part of the elided existing code.
 */
protected async onBeforeCreate(
createBy: CreateBy<Alert>
): Promise<OnCreate<Alert>> {
// ... existing code ...
// Get deduplication config for project
const deduplicationConfig = await this.getDeduplicationConfig(
createBy.data.projectId!
);
// Process through deduplication engine
const deduplicationResult = await DeduplicationEngine.processAlert(
createBy.data,
createBy.data.projectId!,
deduplicationConfig
);
if (!deduplicationResult.shouldCreate) {
// This is a duplicate - don't create
// The message embeds the canonical alert number; it reads "#undefined"
// when the canonical alert could not be loaded.
throw new DuplicateAlertException(
`Duplicate of alert #${deduplicationResult.deduplicationResult.canonicalAlert?.alertNumber}`,
deduplicationResult.deduplicationResult.canonicalAlertId!
);
}
// Add fingerprint to alert data
createBy.data.fingerprint = deduplicationResult.alertData.fingerprint;
// ... rest of existing code ...
}
// In onCreateSuccess():
/**
 * Registers a fingerprint record for the newly created alert.
 *
 * The project's deduplication config is loaded again here; the fingerprint
 * is only registered when that config has `enabled` set.
 *
 * NOTE(review): no return statement is shown in this snippet — it is
 * presumably part of the elided existing code.
 */
protected async onCreateSuccess(
onCreate: OnCreate<Alert>,
createdItem: Alert
): Promise<Alert> {
// ... existing code ...
// Register fingerprint for deduplication
const deduplicationConfig = await this.getDeduplicationConfig(
createdItem.projectId!
);
if (deduplicationConfig.enabled) {
// The created alert becomes the canonical alert of the new fingerprint.
await DeduplicationEngine.registerFingerprint(
createdItem,
deduplicationConfig
);
}
// ... rest of existing code ...
}
// Helper method:
/**
 * Loads the deduplication config stored on the project.
 *
 * @param projectId - Project whose `alertDeduplicationConfig` is read.
 * @returns The stored settings merged over DEFAULT_DEDUPLICATION_CONFIG.
 *   A fresh object is returned on every call.
 */
private async getDeduplicationConfig(
  projectId: ObjectID
): Promise<DeduplicationConfig> {
  const project = await ProjectService.findOneById({
    id: projectId,
    select: { alertDeduplicationConfig: true },
    props: { isRoot: true },
  });

  // Merge instead of choosing one or the other: a stored config that lacks a
  // key (for example `enabled`) must still yield the default for that key,
  // otherwise callers that read `config.enabled` directly see `undefined`.
  // Spreading also avoids handing out the shared default object itself.
  return {
    ...DEFAULT_DEDUPLICATION_CONFIG,
    ...project?.alertDeduplicationConfig,
  };
}
```
---
### 5. DuplicateAlertException
Custom exception for duplicate alerts.
**File Location:** `/Common/Types/Exception/DuplicateAlertException.ts`
```typescript
import Exception from './Exception';
import ExceptionCode from './ExceptionCode';
import ObjectID from '../ObjectID';
/**
 * Raised when an incoming alert duplicates an existing one. Carries the ID
 * of the canonical alert alongside the message.
 */
export default class DuplicateAlertException extends Exception {
  /**
   * @param message - Human-readable description of the duplicate.
   * @param canonicalAlertId - ID of the alert that the rejected alert
   *   duplicates; exposed as a public property of the same name.
   */
  public constructor(
    message: string,
    public canonicalAlertId: ObjectID
  ) {
    super(ExceptionCode.DuplicateAlertException, message);
  }
}
```
---
## Worker Jobs
### 1. FingerprintCleanup Job
**File Location:** `/Worker/Jobs/AlertDeduplication/FingerprintCleanup.ts`
```typescript
import RunCron from '../../Utils/Cron';
import { EVERY_HOUR } from 'Common/Utils/CronTime';
import AlertFingerprintService from 'Common/Server/Services/AlertFingerprintService';
// Hourly job (not run on startup) that removes expired fingerprint records
// and logs how many were removed when there were any.
// NOTE(review): `logger` is used below, but none of the imports shown in this
// snippet bring it into scope — confirm it is imported in the real job file.
RunCron(
  'AlertDeduplication:FingerprintCleanup',
  { schedule: EVERY_HOUR, runOnStartup: false },
  async () => {
    const removedCount = await AlertFingerprintService.cleanupExpired();
    if (!(removedCount > 0)) {
      return;
    }
    logger.info(`Cleaned up ${removedCount} expired fingerprints`);
  }
);
```
---
## Redis Caching (Optional Enhancement)
For high-throughput systems, cache fingerprints in Redis.
**File Location:** `/Common/Server/Utils/Alert/FingerprintCache.ts`
```typescript
import Redis from '../../Infrastructure/Redis';
import ObjectID from 'Common/Types/ObjectID';
/**
 * Redis-backed cache for fingerprint lookups.
 *
 * Two keys exist per (project, fingerprint): the main key holds a JSON
 * object with the canonical alert ID and duplicate count, and
 * `<main key>:count` holds the counter maintained by
 * `incrementDuplicateCount`.
 */
export default class FingerprintCache {
  private static CACHE_PREFIX = 'alert:fingerprint:';
  private static DEFAULT_TTL_SECONDS = 3600; // 1 hour

  /**
   * Get a cached fingerprint.
   *
   * @returns The cached entry, or `null` when the key is absent or its
   *   content is not a valid entry. Unreadable cache content is treated as a
   *   cache miss instead of an error.
   */
  public static async get(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<{ canonicalAlertId: string; duplicateCount: number } | null> {
    const key = this.buildKey(projectId, fingerprint);
    const value = await Redis.get(key);
    if (!value) {
      return null;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(value);
    } catch {
      // Corrupt entry: behave like a miss so the caller falls back to the DB.
      return null;
    }

    if (typeof parsed !== 'object' || parsed === null) {
      return null;
    }
    const { canonicalAlertId, duplicateCount } = parsed as Record<
      string,
      unknown
    >;
    if (
      typeof canonicalAlertId !== 'string' ||
      typeof duplicateCount !== 'number'
    ) {
      return null;
    }
    return { canonicalAlertId, duplicateCount };
  }

  /**
   * Set a fingerprint in cache.
   *
   * @param ttlSeconds - Lifetime of the entry; defaults to one hour.
   */
  public static async set(
    projectId: ObjectID,
    fingerprint: string,
    data: { canonicalAlertId: string; duplicateCount: number },
    ttlSeconds: number = this.DEFAULT_TTL_SECONDS
  ): Promise<void> {
    const key = this.buildKey(projectId, fingerprint);
    await Redis.setex(key, ttlSeconds, JSON.stringify(data));
  }

  /**
   * Increment duplicate count in cache.
   *
   * The counter lives in the separate `<key>:count` key; the JSON entry
   * written by `set` is not modified.
   * NOTE(review): no expiry is set on the counter key here, so it only
   * disappears through `delete` — consider giving it the entry's TTL.
   *
   * @returns The counter value after the increment.
   */
  public static async incrementDuplicateCount(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<number> {
    const countKey = this.buildCountKey(projectId, fingerprint);
    return await Redis.incr(countKey);
  }

  /**
   * Delete a fingerprint from cache, including its counter key, so a stale
   * counter is not left behind.
   */
  public static async delete(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<void> {
    await Redis.del(this.buildKey(projectId, fingerprint));
    await Redis.del(this.buildCountKey(projectId, fingerprint));
  }

  // Main cache key: prefix + project ID + fingerprint.
  private static buildKey(projectId: ObjectID, fingerprint: string): string {
    return `${this.CACHE_PREFIX}${projectId.toString()}:${fingerprint}`;
  }

  // Key of the duplicate counter that belongs to the main cache key.
  private static buildCountKey(
    projectId: ObjectID,
    fingerprint: string
  ): string {
    return `${this.buildKey(projectId, fingerprint)}:count`;
  }
}
```
---
## Implementation Checklist
### Phase 1: Core Components
- [ ] Create FingerprintGenerator utility
- [ ] Create DeduplicationEngine
- [ ] Create AlertFingerprintService
- [ ] Create DuplicateAlertException
### Phase 2: Integration
- [ ] Modify AlertService.onBeforeCreate()
- [ ] Modify AlertService.onCreateSuccess()
- [ ] Add fingerprint fields to Alert model
- [ ] Create AlertFingerprint model
### Phase 3: Background Jobs
- [ ] Create FingerprintCleanup job
- [ ] Register job in worker
### Phase 4: Testing
- [ ] Unit tests for FingerprintGenerator
- [ ] Unit tests for DeduplicationEngine
- [ ] Integration tests for deduplication flow
- [ ] Performance tests for high-volume scenarios
### Phase 5: Optional Enhancements
- [ ] Redis caching for fingerprints
- [ ] Configurable fingerprint fields per project
- [ ] Deduplication analytics API

View File

@@ -0,0 +1,287 @@
# API Design for Alert Deduplication
## Overview
This document defines the REST API endpoints for Alert Deduplication functionality.
## Deduplication Configuration API
### Get Deduplication Config
```http
GET /api/project/{projectId}/alert-deduplication-config
```
**Response:**
```json
{
"enabled": true,
"windowMinutes": 60,
"fingerprintFields": ["monitorId", "createdCriteriaId", "alertSeverityId", "title"],
"normalizeStrings": true
}
```
### Update Deduplication Config
```http
PUT /api/project/{projectId}/alert-deduplication-config
```
**Request Body:**
```json
{
"enabled": true,
"windowMinutes": 120,
"fingerprintFields": ["monitorId", "alertSeverityId", "title"],
"normalizeStrings": true
}
```
---
## Deduplication Statistics API
### Get Deduplication Statistics
```http
GET /api/project/{projectId}/alert-deduplication-stats
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `startDate` | Date | Start of period |
| `endDate` | Date | End of period |
**Response:**
```json
{
"period": {
"startDate": "2026-01-13T00:00:00Z",
"endDate": "2026-01-20T00:00:00Z"
},
"totalAlerts": 5000,
"uniqueAlerts": 2500,
"duplicateCount": 2500,
"deduplicationRate": 50.0,
"topDuplicatedAlerts": [
{
"alertId": "alert-1",
"alertTitle": "MySQL connection timeout",
"duplicateCount": 150,
"monitor": { "name": "mysql-prod" }
},
{
"alertId": "alert-2",
"alertTitle": "API latency high",
"duplicateCount": 89,
"monitor": { "name": "api-gateway" }
}
]
}
```
---
## Alert Fingerprint API
### List Active Fingerprints
```http
GET /api/project/{projectId}/alert-fingerprint
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `limit` | number | Results per page |
| `skip` | number | Pagination offset |
**Response:**
```json
{
"data": [
{
"_id": "fingerprint-1",
"fingerprint": "a1b2c3d4...",
"canonicalAlert": {
"_id": "alert-1",
"alertNumber": 123,
"title": "MySQL connection timeout"
},
"duplicateCount": 15,
"lastDuplicateAt": "2026-01-20T10:45:00Z",
"windowEndAt": "2026-01-20T11:00:00Z"
}
],
"count": 50
}
```
### Get Fingerprint Details
```http
GET /api/project/{projectId}/alert-fingerprint/{fingerprintId}
```
---
## Alert Response Enhancement
The Alert response now includes deduplication fields:
```json
{
"_id": "alert-1",
"alertNumber": 123,
"title": "MySQL connection timeout",
"fingerprint": "a1b2c3d4e5f6...",
"duplicateCount": 15,
"lastDuplicateAt": "2026-01-20T10:45:00Z",
"// ... other fields"
}
```
### Filter Alerts by Duplicate Count
```http
GET /api/project/{projectId}/alert?duplicateCount.gt=10
```
Get alerts with more than 10 duplicates.
---
## Available Fingerprint Fields API
### Get Available Fields
```http
GET /api/alert-deduplication-config/available-fields
```
**Response:**
```json
{
"fields": [
{
"field": "monitorId",
"label": "Monitor",
"description": "Include monitor in fingerprint"
},
{
"field": "createdCriteriaId",
"label": "Criteria",
"description": "Include alert criteria in fingerprint"
},
{
"field": "alertSeverityId",
"label": "Severity",
"description": "Include severity in fingerprint"
},
{
"field": "title",
"label": "Title",
"description": "Include alert title in fingerprint"
},
{
"field": "description",
"label": "Description",
"description": "Include alert description in fingerprint"
}
]
}
```
---
## Test Fingerprint API
### Generate Test Fingerprint
Test what fingerprint would be generated for given alert data.
```http
POST /api/project/{projectId}/alert-deduplication-config/test
```
**Request Body:**
```json
{
"alertData": {
"monitorId": "monitor-1",
"alertSeverityId": "severity-1",
"title": "MySQL connection timeout"
}
}
```
**Response:**
```json
{
"fingerprint": "a1b2c3d4e5f6...",
"fieldsUsed": ["monitorId", "alertSeverityId", "title"],
"fieldValues": {
"monitorId": "monitor-1",
"alertSeverityId": "severity-1",
"title": "mysql connection timeout"
},
"wouldBeDuplicateOf": {
"alertId": "alert-123",
"alertNumber": 123,
"alertTitle": "MySQL connection timeout"
}
}
```
---
## Error Responses
```json
{
"error": {
"code": "DUPLICATE_ALERT",
"message": "Duplicate of alert #123",
"data": {
"canonicalAlertId": "alert-123",
"canonicalAlertNumber": 123,
"duplicateCount": 16
}
}
}
```
Note: This is typically not shown to users as duplicates are handled silently.
---
## Implementation Checklist
### Configuration API
- [ ] GET /alert-deduplication-config
- [ ] PUT /alert-deduplication-config
- [ ] GET /alert-deduplication-config/available-fields
- [ ] POST /alert-deduplication-config/test
### Statistics API
- [ ] GET /alert-deduplication-stats
### Fingerprint API
- [ ] GET /alert-fingerprint (list)
- [ ] GET /alert-fingerprint/:id (details)
### Alert API Updates
- [ ] Add fingerprint to response
- [ ] Add duplicateCount to response
- [ ] Add duplicateCount filter

View File

@@ -0,0 +1,259 @@
# UI Implementation for Alert Deduplication
## Overview
This document details the frontend components and pages required for Alert Deduplication functionality.
## Navigation Structure
```
Dashboard
└── Settings
└── Alerts
├── Alert States (existing)
├── Alert Severities (existing)
├── Grouping Rules
├── Suppression Rules
└── Deduplication (NEW)
```
---
## Pages to Create
### 1. Deduplication Settings Page
**File Location:** `/Dashboard/src/Pages/Settings/AlertDeduplication.tsx`
**Route:** `/dashboard/:projectId/settings/alert-deduplication`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Settings > Alert Deduplication │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Deduplication prevents duplicate alerts from being created. When a duplicate │ │
│ │ alert is detected, it increments the count on the original alert instead. │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ DEDUPLICATION STATUS │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌──────────────────────────────────────────────────────────────────────┐ │
│ │ ✅ Deduplication is ENABLED [Disable] │ │
│ └──────────────────────────────────────────────────────────────────────┘ │
│ │
│ STATISTICS (Last 7 Days) │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
│ │ Total Alerts │ │ Unique Alerts │ │ Deduplicated │ │
│ │ │ │ │ │ │ │
│ │ 5,000 │ │ 2,500 │ │ 2,500 │ │
│ │ │ │ │ │ (50%) │ │
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
│ │
│ CONFIGURATION │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ Deduplication Window │
│ ┌──────────┐ │
│ │ 60 │ minutes │
│ └──────────┘ │
│ Alerts with the same fingerprint within this window are considered duplicates. │
│ │
│ Fingerprint Fields │
│ Select which fields to include when computing the alert fingerprint: │
│ │
│ ☑ Monitor - Include monitor in fingerprint │
│ ☑ Criteria - Include alert criteria in fingerprint │
│ ☑ Severity - Include severity level in fingerprint │
│ ☑ Title - Include alert title in fingerprint │
│ ☐ Description - Include alert description in fingerprint │
│ │
│ String Normalization │
│ ☑ Normalize strings (convert to lowercase, trim whitespace) │
│ │
│ [Save Changes] │
│ │
│ TEST FINGERPRINT │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ Test what fingerprint would be generated for an alert: │
│ │
│ Monitor Severity Title │
│ ┌───────────────┐ ┌───────────────┐ ┌─────────────────────────────────────────┐ │
│ │ mysql-prod [▼]│ │ Critical [▼] │ │ Connection timeout │ │
│ └───────────────┘ └───────────────┘ └─────────────────────────────────────────┘ │
│ │
│ [Generate Fingerprint] │
│ │
│ Result: │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Fingerprint: a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6 │ │
│ │ This would be a DUPLICATE of Alert #123: "MySQL connection timeout" │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 2. Alert Detail Enhancement
Add deduplication info to Alert detail page.
**Wireframe Addition:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Alert #123: MySQL connection timeout │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────────────────────────────────────┐ │
│ │ DEDUPLICATION INFO │ │
│ │ ────────────────────────────────────────── │ │
│ │ │ │
│ │ 🔢 Duplicate Count: 15 │ │
│ │ This alert represents 16 total │ │
│ │ occurrences (1 original + 15 dupes) │ │
│ │ │ │
│ │ 🕐 Last Duplicate: 10 minutes ago │ │
│ │ │ │
│ │ 🔑 Fingerprint: │ │
│ │ a1b2c3d4e5f6... [Copy]│ │
│ └──────────────────────────────────────────────┘ │
│ │
│ // ... rest of alert details │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 3. Alerts Table Enhancement
Add duplicate count column to alerts table.
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Alerts │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌───────┬──────────────────────────────────┬──────────┬───────┬──────┬────────────┐│
│ │ ID │ Title │ Severity │ Dupes │ State│ Age ││
│ ├───────┼──────────────────────────────────┼──────────┼───────┼──────┼────────────┤│
│ │ #127 │ MySQL connection timeout │ Critical │ x15 │ ● │ 2m ││
│ │ #126 │ Disk space low │ Warning │ x3 │ ● │ 15m ││
│ │ #125 │ API response slow │ High │ — │ ✓ │ 1h ││
│ │ #124 │ Memory usage high │ Warning │ x47 │ ● │ 2h ││
│ └───────┴──────────────────────────────────┴──────────┴───────┴──────┴────────────┘│
│ │
│ Dupes = Number of duplicate alerts suppressed │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
## Components to Create
### 1. DeduplicationStatsCard
**File:** `/Dashboard/src/Components/Deduplication/DeduplicationStatsCard.tsx`
Shows deduplication statistics in a card format.
```typescript
// Props for DeduplicationStatsCard, which shows deduplication statistics in a card.
interface DeduplicationStatsCardProps {
// Total number of alerts in the reporting period ("Total Alerts" in the settings wireframe).
totalAlerts: number;
// Number of distinct alerts after deduplication ("Unique Alerts").
uniqueAlerts: number;
// Number of alerts that were deduplicated ("Deduplicated").
duplicateCount: number;
// Share of alerts that were deduplicated.
// NOTE(review): unit is not specified here; the wireframe renders a percentage (e.g. 50%) — confirm 0-100 vs 0-1.
deduplicationRate: number;
}
```
### 2. FingerprintFieldSelector
**File:** `/Dashboard/src/Components/Deduplication/FingerprintFieldSelector.tsx`
Checkbox list for selecting fingerprint fields.
```typescript
// Props for FingerprintFieldSelector, a checkbox list for selecting fingerprint fields.
interface FingerprintFieldSelectorProps {
// Currently selected fields.
// Presumably these are `field` identifiers taken from `availableFields` — confirm.
selectedFields: Array<string>;
// Callback receiving a list of fields; presumably the full updated selection after a toggle — confirm.
onChange: (fields: Array<string>) => void;
// Fields the user can choose from. Each entry has the same shape as the entries
// shown in the available-fields API response (field, label, description).
availableFields: Array<{
field: string;
label: string;
description: string;
}>;
}
```
### 3. FingerprintTester
**File:** `/Dashboard/src/Components/Deduplication/FingerprintTester.tsx`
Form for testing fingerprint generation.
### 4. DuplicateCountBadge
**File:** `/Dashboard/src/Components/Deduplication/DuplicateCountBadge.tsx`
Badge showing duplicate count.
```typescript
// Props for DuplicateCountBadge, a badge showing an alert's duplicate count.
interface DuplicateCountBadgeProps {
// Duplicate count to display.
count: number;
// Optional. Presumably controls whether the badge is rendered when count is 0
// (the alerts table wireframe shows a dash for alerts without duplicates).
// NOTE(review): default value is not specified here — confirm.
showIfZero?: boolean;
}
```
### 5. DeduplicationInfoCard
**File:** `/Dashboard/src/Components/Deduplication/DeduplicationInfoCard.tsx`
Card for alert detail page showing deduplication info.
---
## Routing Configuration
Add to route configuration:
```typescript
{
path: '/dashboard/:projectId/settings/alert-deduplication',
component: AlertDeduplicationPage,
}
```
---
## Implementation Checklist
### Pages
- [ ] Deduplication settings page
### Components
- [ ] DeduplicationStatsCard
- [ ] FingerprintFieldSelector
- [ ] FingerprintTester
- [ ] DuplicateCountBadge
- [ ] DeduplicationInfoCard
### Existing Page Updates
- [ ] Add duplicate count column to Alerts table
- [ ] Add deduplication info to Alert detail page
- [ ] Add sidebar navigation item
### Styling
- [ ] Stats card styles
- [ ] Badge styles
- [ ] Field selector styles

View File

@@ -0,0 +1,165 @@
# Alert Deduplication Implementation Plan
## Overview
This sub-plan details the implementation of Alert Deduplication and Fingerprinting functionality for OneUptime. This feature prevents duplicate alerts from being created and tracks duplicate occurrences.
## Documents
| Document | Description |
|----------|-------------|
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
| [2-Backend.md](./2-Backend.md) | Backend services and deduplication engine |
| [3-API.md](./3-API.md) | REST API endpoints |
| [4-UI.md](./4-UI.md) | Frontend components and pages |
## Feature Summary
### What is Alert Deduplication?
Alert Deduplication prevents the same alert from being created multiple times within a configurable time window. Instead of creating duplicate alerts, the system increments a counter on the original alert.
### How Fingerprinting Works
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ Alert Fingerprint Generation │
└─────────────────────────────────────────────────────────────────────────────────┘
Alert Data Fingerprint Fields Hash
┌─────────────────┐ ┌───────────────────┐ ┌────────────┐
│ monitorId: abc │ │ monitorId: abc │ │ │
│ criteriaId: xyz │ ──► │ criteriaId: xyz │ ──► │ SHA-256 │
│ severity: high │ │ severity: high │ │ = a1b2c3.. │
│ title: "Error" │ │ title: "Error" │ │ │
│ time: 10:00 AM │ │ (time excluded) │ │ │
└─────────────────┘ └───────────────────┘ └────────────┘
```
### Key Capabilities
1. **Fingerprint Generation** - Compute unique hash from alert fields
2. **Time-Window Deduplication** - Deduplicate within configurable window
3. **Duplicate Counting** - Track how many duplicates were suppressed
4. **Configurable Fields** - Choose which fields to include in fingerprint
5. **Per-Project Settings** - Customize deduplication per project
### Benefits
| Without Deduplication | With Deduplication |
|-----------------------|-------------------|
| 100 identical alerts created | 1 alert with duplicate count: 99 |
| 100 notifications sent | 1 notification sent |
| Alert fatigue | Reduced noise |
| Storage waste | Efficient storage |
### User Stories
```
As an operator, I want duplicate alerts to be automatically merged
so that I don't see the same alert repeated 50 times.
As a team lead, I want to know how many times an alert occurred
so that I can understand the severity of the issue.
As an SRE, I want to configure the deduplication window
so that I can tune it for my team's workflow.
```
## Implementation Phases
### Phase 1: Core Fingerprinting (Week 1)
- [ ] Create FingerprintGenerator utility
- [ ] Add fingerprint field to Alert model
- [ ] Implement basic SHA-256 fingerprinting
- [ ] Add duplicate count field to Alert
### Phase 2: Deduplication Engine (Week 2)
- [ ] Create AlertFingerprint cache model
- [ ] Implement DeduplicationEngine
- [ ] Integrate with AlertService
- [ ] Add time-window support
### Phase 3: Configuration & UI (Week 3)
- [ ] Add project-level deduplication settings
- [ ] Create deduplication configuration UI
- [ ] Add duplicate count to Alert detail view
- [ ] Add deduplication metrics
### Phase 4: Advanced Features (Week 4)
- [ ] Configurable fingerprint fields
- [ ] Redis caching for fingerprints
- [ ] Deduplication analytics dashboard
## Architecture
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ Deduplication Flow │
└─────────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────┐
│ Alert Trigger │
└──────────┬───────────┘
┌──────────────────────┐
│ FingerprintGenerator │
│ .generate() │
└──────────┬───────────┘
┌──────────────────────┐
│ DeduplicationEngine │
│ .checkDuplicate() │
└──────────┬───────────┘
┌────────────────┴────────────────┐
│ │
▼ ▼
┌─────────────────┐ ┌─────────────────┐
│ DUPLICATE │ │ NEW │
│ │ │ │
│ - Increment │ │ - Create alert │
│ count on │ │ - Register │
│ original │ │ fingerprint │
│ - Skip creation │ │ - Send notifs │
└─────────────────┘ └─────────────────┘
```
## Configuration Options
```typescript
// Per-project deduplication settings.
interface DeduplicationConfig {
// Enable/disable deduplication
enabled: boolean;
// Time window for deduplication (minutes)
windowMinutes: number; // Default: 60
// Fields to include in fingerprint
fingerprintFields: Array<string>; // Default: ['monitorId', 'createdCriteriaId', 'alertSeverityId', 'title'] (field ids as listed by the available-fields API)
// Whether to normalize strings (lowercase, trim)
normalizeStrings: boolean; // Default: true
}
```
## Success Metrics
| Metric | Target |
|--------|--------|
| Duplicate detection accuracy | > 99% |
| Fingerprint generation time | < 5ms |
| Storage reduction | 30-50% |
| Notification reduction | 40-60% |
## References
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
- [Alert Grouping Plan](../AlertGrouping/README.md)
- [Alert Suppression Plan](../AlertSuppression/README.md)

View File

@@ -0,0 +1,319 @@
# Alert Grouping / Episodes - Summary
## What is Alert Grouping?
Alert Grouping is a feature that automatically combines related alerts into logical containers called **Episodes**. Instead of seeing 50 individual "connection timeout" alerts, operators see one episode: "Database Connectivity Issues (50 alerts)".
## Key Capabilities
1. **Automatic Grouping** - Rules-based grouping of alerts into episodes
2. **Time-Window Grouping** - Group alerts occurring within N minutes
3. **Field-Based Grouping** - Group by monitor, monitor custom fields, severity, labels, etc.
4. **Manual Management** - Add/remove alerts from episodes (merge/split deferred to future)
5. **Episode Lifecycle** - Active → Acknowledged → Resolved states. These should be linked to alert states.
6. **Root Cause Tracking** - Document root cause analysis per episode. This is a placeholder field for the user to fill out. We can also offer a "Generate with AI" option that summarizes the episode based on the root causes of all the alerts in the episode.
7. **Flapping Prevention** - Grace periods before resolution and reopen windows
## Data Models
### Three New Models
| Model | Purpose |
|-------|---------|
| **AlertEpisode** | Container for grouped alerts (title, state, severity, timing, ownership) |
| **AlertEpisodeMember** | Links alerts to episodes with metadata (addedBy, addedAt) |
| **AlertGroupingRule** | Configures automatic grouping behavior (match criteria, grouping config, priority) |
### Alert Model Enhancements
- `episodeId` - Link to parent episode (nullable)
## Grouping Types
| Type | Description |
|------|-------------|
| **Time Window** | Groups alerts within N minutes of each other |
| **Field-Based** | Groups by matching fields (monitor, severity, labels) |
| **Smart** | ML-based similarity matching (future) |
## Flapping Prevention
- **resolveDelayMinutes** - Grace period before auto-resolving (prevents rapid state changes)
- **reopenWindowMinutes** - Window after resolution where episode can be reopened instead of creating new
## On-Call Policy Resolution
Priority chain for notifications:
1. A grouping rule can be linked to an on-call policy. When an episode is created via a grouping rule, that rule's on-call policy is used.
2. If the alert has its own on-call policy, that policy is executed as well, in addition to the grouping rule's on-call policy.
3. If neither the grouping rule nor alert has an on-call policy, no notifications are sent.
When an alert joins an episode, the alert policy (if any) is executed as normal. The episode's on-call policy is also executed. This means that if an alert has an on-call policy, notifications may be sent twice - once for the alert and once for the episode. If the episode policy is executed and then a new alert joins the episode, the episode's on-call policy is NOT re-executed.
## Worker Jobs
- **EpisodeAutoResolve** - Resolves episodes when all alerts resolved
- **EpisodeBreakInactive** - Resolves episodes after inactivity period
## Database Migrations
Please do not write Database migrations. I will do that manually.
---
## Implementation Q&A (Industry Best Practices)
### Episode State Management
**Q1: How should episode states link to alert states?**
The episode state should reflect the "worst" or most urgent state among its member alerts:
- **Active**: At least one alert in the episode is in an active/firing state
- **Acknowledged**: All active alerts have been acknowledged, but not yet resolved
- **Resolved**: All alerts in the episode are resolved
This follows the pattern used by PagerDuty, Opsgenie, and other incident management platforms. The episode acts as an aggregate - it's only fully resolved when all underlying alerts are resolved.
If I acknowledge an episode, for example, all active alerts in that episode should also be acknowledged. This ensures consistency between episode and alert states.
**Q2: If a new alert joins an already-acknowledged episode, should the episode state change back to Active?**
**No** - the acknowledgment applies to the episode as a container, not to individual alerts. When a new alert joins an acknowledged episode:
- The episode remains in "Acknowledged" state
- The new alert is marked as part of an acknowledged episode
- No new notification is sent for the episode (since it's already acknowledged)
- The alert's own on-call policy still executes normally
This prevents notification fatigue while ensuring the operator knows the episode is still being worked on.
---
### Grouping Logic
**Q3: For Time Window grouping - if an alert comes in after the initial window, should it create a new episode or join the existing one?**
Use a **rolling/sliding window** approach:
- The time window refers to the **gap between consecutive alerts**, not from the first alert
- If an episode is still **Active** and a matching alert arrives, it joins the episode regardless of when the first alert occurred
- The time window is used to determine when an episode becomes "inactive" (no new alerts for N minutes)
- Example: With a 10-minute window, alerts at T+0, T+8, T+15, T+22 would all be in the same episode (each gap < 10 min)
This is the standard approach in tools like PagerDuty's Intelligent Grouping and Opsgenie's Alert Deduplication.
**Q4: What fields can be matched in Field-Based grouping?**
Standard matchable fields should include:
| Field | Description |
|-------|-------------|
| `monitorId` | Same monitor/service |
| `monitorCustomFields` | User-defined monitor metadata |
| `alertSeverity` | Critical, Warning, Info, etc. |
| `labels` | Key-value tags on alerts |
| `alertTitle` | Exact or pattern match on title |
| `alertDescription` | Pattern match on description |
| `telemetryQuery` | The query that triggered the alert |
Rules should support both exact matching and regex patterns for string fields.
**Q5: If multiple AlertGroupingRules match a single alert, which rule takes priority?**
Use explicit **priority ordering**:
- Each rule has a `priority` field (lower number = higher priority)
- The **first matching rule** (highest priority) wins
- Only one rule processes each alert
- If no rules match, the alert remains ungrouped (standalone)
This gives operators explicit control over rule precedence, similar to firewall rules or routing tables.
---
### Scope & Implementation
**Q8: Should we implement backend only or both backend and frontend?**
Backend and Frontend. Please do not implement any database migrations. I will do that manually.
**Q9: What existing patterns in the codebase should we follow?**
Look at these existing features for patterns:
- **Alert model and workflows** - Base patterns for state management
- **Incident management** (if it exists) - Similar grouping/aggregation concepts
- **On-Call Policy execution** - Notification routing patterns
- **Scheduled Jobs/Workers** - Pattern for background job implementation
- **CRUD APIs** - Standard API patterns for the new models
---
### Worker Jobs
**Q10: What should the inactivity period be for EpisodeBreakInactive?**
Make it **configurable per rule** with sensible defaults:
- **Default**: 60 minutes of inactivity
- **Configurable range**: 5 minutes to 24 hours
- **Per-rule setting**: `inactivityTimeoutMinutes` on `AlertGroupingRule`
The worker job should run every 1-5 minutes, checking for episodes that have exceeded their inactivity threshold.
| Scenario | Recommended Timeout |
|----------|---------------------|
| High-frequency alerts (metrics) | 5-15 minutes |
| Standard monitoring | 30-60 minutes |
| Low-frequency events | 2-4 hours |
| Maintenance windows | 12-24 hours |
---
### Episode Title Generation
**Q11: When an episode is auto-created by a rule, how should the title be generated?**
**Recommendation**: Use a two-tier approach:
1. **Default**: Use the first alert's title as the episode title
2. **Optional override**: Allow a template on the `AlertGroupingRule` for custom naming
Template variables could include:
- `{alertTitle}` - First alert's title
- `{monitorName}` - Monitor/service name
- `{alertSeverity}` - Severity level
- `{alertCount}` - Number of alerts (updated dynamically)
Example templates:
- `"{alertSeverity} issues on {monitorName}"` → "Critical issues on API Server"
- `"{monitorName} - {alertTitle}"` → "Database - Connection timeout"
If no template is specified on the rule, default to the first alert's title.
---
### Manual Management
**Q12: If an alert is removed from an episode, what happens to it?**
The alert becomes **standalone** (ungrouped). The user can then optionally move it to a different episode manually. The alert's `episodeId` is set to null.
---
### State Synchronization
**Q13: If I manually resolve an episode, should all member alerts also be resolved?**
**Yes** - resolving an episode resolves all member alerts. This mirrors the acknowledge behavior: episode state changes cascade down to all member alerts for consistency.
State cascade rules:
- **Acknowledge episode** → Acknowledge all member alerts
- **Resolve episode** → Resolve all member alerts
---
### Grouping Combinations
**Q14: Can a single rule use BOTH Time Window AND Field-Based grouping together?**
**Yes** - rules can combine both grouping types. For example:
- "Group alerts from the **same monitor** that occur **within 10 minutes** of each other"
- "Group alerts with the **same severity and labels** within a **30-minute window**"
Both conditions must be satisfied for alerts to be grouped together.
---
### Alert Eligibility
**Q15: Can only Active alerts be grouped into episodes, or can alerts in any state be grouped?**
Alerts in **any state** can be grouped into episodes. This allows:
- Grouping historical alerts for post-incident analysis
- Manual organization of already-resolved alerts
- Flexibility in episode management regardless of alert lifecycle stage
---
### Episode Ownership
**Q16: What ownership fields should AlertEpisode have?**
- **Assigned User** - Individual user responsible for the episode
- **Assigned Team** - Team responsible for the episode
Both are optional and can be set manually or inherited from the grouping rule configuration.
---
### Episode Severity
**Q17: How should episode severity be determined?**
Use a **high-water mark with manual override** approach:
1. **Initial**: Set to first alert's severity
2. **Auto-escalate**: When a new alert joins, if its severity > current episode severity → update episode to higher severity
3. **Never auto-downgrade**: If lower severity alert joins → keep current episode severity
4. **Manual override allowed**: User can edit severity at any time
5. **Override respected until escalation**: If user sets to "Warning" but a "Critical" alert joins → escalate to "Critical"
This ensures users are always alerted to the worst-case scenario while respecting manual judgment when appropriate.
---
### Root Cause Field
**Q18: What is the structure of the root cause field?**
Simple **text field** on AlertEpisode. Users can document their root cause analysis as free-form text. Future enhancement: AI-assisted summarization based on alert data.
---
### Flapping Prevention Configuration
**Q19: Where are flapping prevention settings configured?**
**Per-rule** on AlertGroupingRule:
- `resolveDelayMinutes` - Grace period before auto-resolving
- `reopenWindowMinutes` - Window after resolution where episode can be reopened
Each rule can have different flapping prevention settings based on the type of alerts it handles.
---
### Manual Episode Creation
**Q20: Can users create episodes manually without a grouping rule?**
**Yes** - users can manually create episodes and add alerts to them. This allows:
- Ad-hoc grouping for incidents that don't match existing rules
- Post-incident organization of related alerts
- Flexibility for edge cases not covered by automated rules
---
### Episode Deletion
**Q21: Can episodes be deleted? What happens to member alerts?**
**Yes** - episodes can be deleted. Alerts should be **removed first** to make them standalone; this is a safety measure to prevent accidental data loss. The recommended deletion flow:
1. User removes all alerts from episode (alerts become standalone)
2. User can then delete the empty episode
Alternatively, if alerts are still in the episode when deleted, they become standalone automatically.
---
### UI Location
**Q22: Where should Episodes appear in the navigation?**
New **sidemenu item** in the Alerts section. Episodes should have their own dedicated list page accessible from the main navigation, similar to how Alerts have their own page.
---
### Alert-to-Episode Relationship
**Q23: Can an alert belong to multiple episodes?**
**No** - an alert can only belong to **one episode at a time** (single ownership). This provides:
- Simpler mental model for users
- Clear state cascade without conflicts
- Industry-standard approach (PagerDuty, Opsgenie)
- Cleaner queries and data management
The `episodeId` field on Alert is singular and nullable.

View File

@@ -0,0 +1,774 @@
# Data Models for Alert Storm Detection
## Overview
This document defines the database models required for Alert Storm Detection and Noise Reduction Analytics functionality.
## Entity Relationship Diagram
```
┌─────────────────────────┐
│ AlertStormEvent │
├─────────────────────────┤
│ id │
│ projectId │
│ status (active/resolved)│
│ startedAt │
│ endedAt │
│ peakAlertRate │
│ normalAlertRate │
│ multiplier │
│ affectedMonitors (JSON) │
│ totalAlertsInStorm │
└─────────────────────────┘
┌─────────────────────────┐
│ NoiseReductionMetric │
├─────────────────────────┤
│ id │
│ projectId │
│ date │
│ totalAlerts │
│ deduplicated │
│ suppressed │
│ grouped │
│ notificationsSent │
│ noiseReductionPercent │
└─────────────────────────┘
┌─────────────────────────┐
│ AlertVolumeSnapshot │
├─────────────────────────┤
│ id │
│ projectId │
│ timestamp │
│ alertCount │
│ intervalMinutes │
└─────────────────────────┘
```
---
## Model Definitions
### 1. AlertStormEvent
Records storm events for tracking and analysis.
**File Location:** `/Common/Models/DatabaseModels/AlertStormEvent.ts`
```typescript
import {
Column,
Entity,
Index,
JoinColumn,
ManyToOne,
} from 'typeorm';
import BaseModel from './DatabaseBaseModel/DatabaseBaseModel';
import Project from './Project';
import ObjectID from 'Common/Types/ObjectID';
import ColumnType from 'Common/Types/Database/ColumnType';
import TableColumnType from 'Common/Types/Database/TableColumnType';
import Permission from 'Common/Types/Permission';
import IconProp from 'Common/Types/Icon/IconProp';
// Lifecycle status of a storm event; stored in the `status` column of AlertStormEvent.
export enum StormStatus {
Active = 'active',
Resolved = 'resolved',
}
// Severity tier of a storm, expressed relative to the normal alert rate (see the
// per-member ranges below); stored in the `severity` column of AlertStormEvent.
// NOTE(review): the documented ranges overlap at exactly 3x (Elevated vs Storm) and
// leave exactly 5x ambiguous (Storm vs Critical) — confirm which tier owns each boundary.
export enum StormSeverity {
Elevated = 'elevated', // 2x - 3x normal
Storm = 'storm', // 3x - 5x normal
Critical = 'critical', // > 5x normal
}
// One entry of the `affectedMonitors` JSON column on AlertStormEvent
// ("Top monitors contributing to the storm").
export interface AffectedMonitor {
// Monitor identifier, kept as a plain string inside the JSON payload.
monitorId: string;
// Monitor display name.
monitorName: string;
// Presumably the number of alerts this monitor contributed during the storm — confirm.
alertCount: number;
}
/**
 * Database model recording a single alert storm event for a project: its status
 * and severity, when it started and ended, how the peak alert rate compared to the
 * normal (baseline) rate, which monitors contributed, and whether emergency
 * suppression was active.
 *
 * NOTE(review): the `create` and `update` permission lists are empty on the table
 * and on every column, while `read` is granted to project owners, admins and
 * members. Presumably rows are written only by backend code rather than through
 * the CRUD API — confirm.
 */
@EnableDocumentation()
@TableAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
delete: [Permission.ProjectOwner, Permission.ProjectAdmin],
})
@CrudApiEndpoint(new Route('/alert-storm-event'))
@TableMetadata({
tableName: 'AlertStormEvent',
singularName: 'Storm Event',
pluralName: 'Storm Events',
icon: IconProp.Alert,
tableDescription: 'Records of alert storm events',
})
@Entity({
name: 'AlertStormEvent',
})
export default class AlertStormEvent extends BaseModel {
// ─────────────────────────────────────────────────────────────
// PROJECT
// ─────────────────────────────────────────────────────────────
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Entity,
modelType: Project,
title: 'Project',
})
// Storm events are deleted together with their project (onDelete: 'CASCADE').
@ManyToOne(() => Project, {
onDelete: 'CASCADE',
orphanedRowAction: 'delete',
})
@JoinColumn({ name: 'projectId' })
public project?: Project = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.ObjectID,
title: 'Project ID',
})
@Column({
type: ColumnType.ObjectID,
nullable: false,
})
@Index()
public projectId?: ObjectID = undefined;
// ─────────────────────────────────────────────────────────────
// STATUS
// ─────────────────────────────────────────────────────────────
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.ShortText,
title: 'Status',
description: 'Current status of the storm',
})
// Stored as short text (max length 20) holding a StormStatus value; indexed.
@Column({
type: ColumnType.ShortText,
length: 20,
nullable: false,
})
@Index()
public status?: StormStatus = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.ShortText,
title: 'Severity',
description: 'Severity level of the storm',
})
// Stored as short text (max length 20) holding a StormSeverity value.
@Column({
type: ColumnType.ShortText,
length: 20,
nullable: false,
})
public severity?: StormSeverity = undefined;
// ─────────────────────────────────────────────────────────────
// TIMING
// ─────────────────────────────────────────────────────────────
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Started At',
description: 'When the storm was first detected',
})
@Column({
type: ColumnType.Date,
nullable: false,
})
@Index()
public startedAt?: Date = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Ended At',
description: 'When the storm ended',
})
// Nullable — presumably left unset while the storm is still active (confirm).
@Column({
type: ColumnType.Date,
nullable: true,
})
public endedAt?: Date = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Duration Minutes',
description: 'Total duration of the storm in minutes',
})
// Nullable — presumably filled in once the storm has ended (confirm).
@Column({
type: ColumnType.Number,
nullable: true,
})
public durationMinutes?: number = undefined;
// ─────────────────────────────────────────────────────────────
// METRICS
// ─────────────────────────────────────────────────────────────
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Peak Alert Rate',
description: 'Peak alerts per hour during storm',
})
@Column({
type: ColumnType.Number,
nullable: false,
})
public peakAlertRate?: number = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Normal Alert Rate',
description: 'Normal alerts per hour (baseline)',
})
@Column({
type: ColumnType.Number,
nullable: false,
})
public normalAlertRate?: number = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Multiplier',
description: 'How many times normal the peak rate was',
})
// Decimal with precision 5 and scale 2, i.e. two fractional digits and at most 999.99.
@Column({
type: ColumnType.Decimal,
precision: 5,
scale: 2,
nullable: false,
})
public multiplier?: number = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Total Alerts',
description: 'Total alerts during the storm',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public totalAlertsInStorm?: number = undefined;
// ─────────────────────────────────────────────────────────────
// AFFECTED MONITORS
// ─────────────────────────────────────────────────────────────
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.JSON,
title: 'Affected Monitors',
description: 'Top monitors contributing to the storm',
})
// JSON column holding an array of AffectedMonitor entries; nullable.
@Column({
type: ColumnType.JSON,
nullable: true,
})
public affectedMonitors?: Array<AffectedMonitor> = undefined;
// ─────────────────────────────────────────────────────────────
// SUPPRESSION
// ─────────────────────────────────────────────────────────────
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Boolean,
title: 'Emergency Suppression Active',
description: 'Whether emergency suppression was activated',
})
@Column({
type: ColumnType.Boolean,
nullable: false,
default: false,
})
public emergencySuppressionActive?: boolean = undefined;
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Suppressed During Storm',
description: 'Alerts suppressed during storm (if emergency suppression active)',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public suppressedDuringStorm?: number = undefined;
}
```
---
### 2. NoiseReductionMetric
Daily metrics for noise reduction analytics.
**File Location:** `/Common/Models/DatabaseModels/NoiseReductionMetric.ts`
```typescript
/**
 * Per-project, per-day rollup of noise-reduction statistics.
 *
 * Every column below is declared with empty `create`/`update` permission lists and a read
 * list of ProjectOwner, ProjectAdmin and ProjectMember.
 *
 * NOTE(review): NoiseReductionAnalytics.calculateDailyMetrics, as drafted in the backend plan,
 * writes projectId, date, totalAlertTriggers, alertsCreated, deduplicated, suppressed,
 * alertsGrouped, episodesCreated and noiseReductionPercent only. suppressedByMaintenance,
 * suppressedByRateLimit, notificationsSent and notificationsSuppressed are not written there,
 * so they would stay at their default of 0 unless another writer fills them in — confirm.
 */
@TableMetadata({
tableName: 'NoiseReductionMetric',
singularName: 'Noise Reduction Metric',
pluralName: 'Noise Reduction Metrics',
icon: IconProp.ChartBar,
tableDescription: 'Daily noise reduction statistics',
})
@Entity({
name: 'NoiseReductionMetric',
})
export default class NoiseReductionMetric extends BaseModel {
// ─────────────────────────────────────────────────────────────
// PROJECT
// ─────────────────────────────────────────────────────────────
// Project this row belongs to; required and indexed on its own.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.ObjectID,
title: 'Project ID',
})
@Column({
type: ColumnType.ObjectID,
nullable: false,
})
@Index()
public projectId?: ObjectID = undefined;
// ─────────────────────────────────────────────────────────────
// DATE
// ─────────────────────────────────────────────────────────────
// Day this row summarises; calculateDailyMetrics stores the start of that day here.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Date',
description: 'Date for these metrics',
})
@Column({
type: ColumnType.Date,
nullable: false,
})
@Index()
public date?: Date = undefined;
// ─────────────────────────────────────────────────────────────
// ALERT COUNTS
// ─────────────────────────────────────────────────────────────
// In calculateDailyMetrics this is alertsCreated + deduplicated + suppressed.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Total Alert Triggers',
description: 'Total number of alert triggers (before noise reduction)',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public totalAlertTriggers?: number = undefined;
// Number of Alert rows whose createdAt falls inside the day.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Alerts Created',
description: 'Actual alerts created (after deduplication/suppression)',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public alertsCreated?: number = undefined;
// ─────────────────────────────────────────────────────────────
// DEDUPLICATION METRICS
// ─────────────────────────────────────────────────────────────
// Sum of duplicateCount over AlertFingerprint rows whose window started that day.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Deduplicated',
description: 'Alerts prevented by deduplication',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public deduplicated?: number = undefined;
// ─────────────────────────────────────────────────────────────
// SUPPRESSION METRICS
// ─────────────────────────────────────────────────────────────
// Number of SuppressedAlertLog rows whose suppressedAt falls inside the day.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Suppressed',
description: 'Alerts prevented by suppression rules',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public suppressed?: number = undefined;
// Not written by calculateDailyMetrics as drafted (see class note).
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Suppressed by Maintenance',
description: 'Alerts suppressed by maintenance windows',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public suppressedByMaintenance?: number = undefined;
// Not written by calculateDailyMetrics as drafted (see class note).
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Suppressed by Rate Limit',
description: 'Alerts suppressed by rate limits',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public suppressedByRateLimit?: number = undefined;
// ─────────────────────────────────────────────────────────────
// GROUPING METRICS
// ─────────────────────────────────────────────────────────────
// Alerts created that day that have a non-null episodeId.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Alerts Grouped',
description: 'Alerts grouped into episodes',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public alertsGrouped?: number = undefined;
// Number of AlertEpisode rows whose startedAt falls inside the day.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Episodes Created',
description: 'Number of episodes created',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public episodesCreated?: number = undefined;
// ─────────────────────────────────────────────────────────────
// NOTIFICATION METRICS
// ─────────────────────────────────────────────────────────────
// Read back by getSummary, but not written by calculateDailyMetrics as drafted (see class note).
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Notifications Sent',
description: 'Actual notifications sent (after all filtering)',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public notificationsSent?: number = undefined;
// Not written by calculateDailyMetrics as drafted (see class note).
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Notifications Suppressed',
description: 'Notifications that were suppressed',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 0,
})
public notificationsSuppressed?: number = undefined;
// ─────────────────────────────────────────────────────────────
// CALCULATED METRICS
// ─────────────────────────────────────────────────────────────
// (deduplicated + suppressed) / totalAlertTriggers * 100, rounded to two decimals by
// calculateDailyMetrics; 0 when there were no triggers.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Noise Reduction Percent',
description: 'Overall noise reduction percentage',
})
@Column({
type: ColumnType.Decimal,
precision: 5,
scale: 2,
nullable: false,
default: 0,
})
public noiseReductionPercent?: number = undefined;
}
```
---
### 3. AlertVolumeSnapshot
Periodic snapshots of alert volume for trend analysis.
**File Location:** `/Common/Models/DatabaseModels/AlertVolumeSnapshot.ts`
```typescript
/**
 * One alert-volume sample for a project: how many alerts were seen in an interval of
 * `intervalMinutes` minutes, recorded at `timestamp`.
 *
 * Every column is declared with empty `create`/`update` permission lists.
 *
 * NOTE(review): the read list here is ProjectOwner and ProjectAdmin only, whereas
 * NoiseReductionMetric also grants read access to ProjectMember — confirm the
 * difference is intentional.
 */
@TableMetadata({
tableName: 'AlertVolumeSnapshot',
singularName: 'Volume Snapshot',
pluralName: 'Volume Snapshots',
icon: IconProp.ChartLine,
tableDescription: 'Periodic alert volume snapshots',
})
@Entity({
name: 'AlertVolumeSnapshot',
})
export default class AlertVolumeSnapshot extends BaseModel {
// Project this snapshot belongs to; required and indexed on its own.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.ObjectID,
title: 'Project ID',
})
@Column({
type: ColumnType.ObjectID,
nullable: false,
})
@Index()
public projectId?: ObjectID = undefined;
// Moment the snapshot was taken; required and indexed on its own.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Date,
title: 'Timestamp',
description: 'When this snapshot was taken',
})
@Column({
type: ColumnType.Date,
nullable: false,
})
@Index()
public timestamp?: Date = undefined;
// Alert count for the interval; required and has no default, so the writer must always supply it.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Alert Count',
description: 'Number of alerts in this interval',
})
@Column({
type: ColumnType.Number,
nullable: false,
})
public alertCount?: number = undefined;
// Width of the sampled interval in minutes; defaults to 5 when not supplied.
@ColumnAccessControl({
create: [],
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
update: [],
})
@TableColumn({
type: TableColumnType.Number,
title: 'Interval Minutes',
description: 'Interval size in minutes (e.g., 5, 15, 60)',
})
@Column({
type: ColumnType.Number,
nullable: false,
default: 5,
})
public intervalMinutes?: number = undefined;
}
```
---
## Database Indexes
```sql
-- AlertStormEvent indexes
-- Storm history listing: filter by project (and optionally status), newest first.
CREATE INDEX idx_storm_event_project_status
  ON "AlertStormEvent" ("projectId", "status", "startedAt" DESC);
-- Partial index for the "is there an active storm for this project?" lookup.
CREATE INDEX idx_storm_event_active
  ON "AlertStormEvent" ("projectId", "status")
  WHERE "status" = 'active';
-- NoiseReductionMetric indexes
-- One row per project per day. This unique index also serves per-project date-range
-- and newest-first queries (a B-tree can be scanned backwards), so a second,
-- non-unique index on the same two columns would only add write overhead.
CREATE UNIQUE INDEX idx_noise_metric_unique
  ON "NoiseReductionMetric" ("projectId", "date");
-- AlertVolumeSnapshot indexes
CREATE INDEX idx_volume_snapshot_project_time
  ON "AlertVolumeSnapshot" ("projectId", "timestamp" DESC);
-- Partition by time for efficient cleanup
-- Consider partitioning AlertVolumeSnapshot by month
```
---
## Implementation Checklist
- [ ] Create AlertStormEvent model
- [ ] Create NoiseReductionMetric model
- [ ] Create AlertVolumeSnapshot model
- [ ] Register models in model registry
- [ ] Create database migrations
- [ ] Add indexes
- [ ] Update API permissions

View File

@@ -0,0 +1,630 @@
# Backend Implementation for Alert Storm Detection
## Overview
This document details the backend services and components required for Alert Storm Detection and Noise Reduction Analytics.
## Core Components
### 1. StormDetector
Main service for detecting alert storms.
**File Location:** `/Common/Server/Utils/Alert/StormDetector.ts`
```typescript
import AlertService from '../../Services/AlertService';
import AlertStormEventService from '../../Services/AlertStormEventService';
import MonitorService from '../../Services/MonitorService';
import NotificationService from '../../Services/NotificationService';
import AlertStormEvent, {
  // Aliased because this module exports its own `StormStatus` interface below;
  // importing the enum under the same name is a declaration conflict.
  StormStatus as StormEventStatus,
  StormSeverity,
  AffectedMonitor,
} from 'Common/Models/DatabaseModels/AlertStormEvent';
import ObjectID from 'Common/Types/ObjectID';
import OneUptimeDate from 'Common/Types/Date';
import QueryHelper from '../../Types/Database/QueryHelper';
import logger from '../../Utils/Logger';

/**
 * Result of a single storm check for one project (see StormDetector.checkStatus).
 */
export interface StormStatus {
  /** True when the current rate is at or above the storm threshold and the minimum rate. */
  isStorm: boolean;
  /** Severity band, or null when the rate is unremarkable or below the minimum rate. */
  severity: StormSeverity | null;
  /** Alerts created in the last hour. */
  currentRate: number;
  /** Average alerts per hour over the baseline window, rounded to two decimals. */
  normalRate: number;
  /** currentRate / normalRate, rounded to two decimals (currentRate itself when normalRate is 0). */
  multiplier: number;
  /** Top alerting monitors; only populated when isStorm is true. */
  affectedMonitors?: Array<AffectedMonitor>;
  /** The project's storm event with status Active, if one exists. */
  activeStormEvent?: AlertStormEvent;
}

export interface StormConfig {
  // Multiplier threshold for storm detection
  stormThreshold: number; // Default: 3
  // Multiplier threshold for critical storm
  criticalThreshold: number; // Default: 5
  // Minimum alerts per hour to consider for storm
  minimumAlertRate: number; // Default: 10
  // Historical lookback hours for baseline
  baselineHours: number; // Default: 24
  // Enable emergency suppression
  enableEmergencySuppression: boolean; // Default: false
}

export const DEFAULT_STORM_CONFIG: StormConfig = {
  stormThreshold: 3,
  criticalThreshold: 5,
  minimumAlertRate: 10,
  baselineHours: 24,
  enableEmergencySuppression: false,
};

export default class StormDetector {
  /**
   * Check current storm status for a project.
   *
   * Compares the number of alerts created in the last hour with the hourly average
   * over the preceding baseline window and classifies the ratio.
   *
   * @param projectId - Project to evaluate.
   * @param config - Optional overrides merged over DEFAULT_STORM_CONFIG.
   * @returns The computed rates, severity and any currently active storm event.
   */
  public static async checkStatus(
    projectId: ObjectID,
    config?: Partial<StormConfig>
  ): Promise<StormStatus> {
    const mergedConfig: StormConfig = { ...DEFAULT_STORM_CONFIG, ...config };
    const now = new Date();
    const oneHourAgo = OneUptimeDate.addRemoveHours(now, -1);
    const baselineStart = OneUptimeDate.addRemoveHours(
      now,
      -mergedConfig.baselineHours
    );

    // Get current hour's alert count
    const currentCount = await AlertService.countBy({
      query: {
        projectId,
        createdAt: QueryHelper.greaterThan(oneHourAgo),
      },
      props: { isRoot: true },
    });

    // Get historical average (excluding current hour)
    const historicalCount = await AlertService.countBy({
      query: {
        projectId,
        createdAt: QueryHelper.between(baselineStart, oneHourAgo),
      },
      props: { isRoot: true },
    });

    // The baseline window excludes the current hour, hence "- 1". With a one-hour
    // baseline there is no history to average, so fall back to the minimum rate.
    const hoursInBaseline = mergedConfig.baselineHours - 1;
    const normalRate =
      hoursInBaseline > 0
        ? historicalCount / hoursInBaseline
        : mergedConfig.minimumAlertRate;
    const currentRate = currentCount;
    const multiplier = normalRate > 0 ? currentRate / normalRate : currentRate;

    // Determine storm status
    let isStorm = false;
    let severity: StormSeverity | null = null;
    if (multiplier >= mergedConfig.criticalThreshold) {
      isStorm = true;
      severity = StormSeverity.Critical;
    } else if (multiplier >= mergedConfig.stormThreshold) {
      isStorm = true;
      severity = StormSeverity.Storm;
    } else if (multiplier >= 2) {
      // Elevated is informational only; it does not open a storm event.
      severity = StormSeverity.Elevated;
    }

    // Only consider it a storm if rate is above minimum
    if (currentRate < mergedConfig.minimumAlertRate) {
      isStorm = false;
      severity = null;
    }

    // Get affected monitors if storm
    let affectedMonitors: Array<AffectedMonitor> | undefined;
    if (isStorm) {
      affectedMonitors = await this.getTopAlertingMonitors(projectId, oneHourAgo);
    }

    // Check for active storm event
    const activeStormEvent = await AlertStormEventService.findOneBy({
      query: {
        projectId,
        status: StormEventStatus.Active,
      },
      select: {
        _id: true,
        startedAt: true,
        peakAlertRate: true,
      },
      props: { isRoot: true },
    });

    return {
      isStorm,
      severity,
      currentRate,
      normalRate: Math.round(normalRate * 100) / 100,
      multiplier: Math.round(multiplier * 100) / 100,
      affectedMonitors,
      activeStormEvent: activeStormEvent || undefined,
    };
  }

  /**
   * Get the (up to ten) monitors that produced the most alerts since `since`.
   *
   * NOTE(review): the query below is a MongoDB-style aggregation pipeline, while the
   * models in this plan are TypeORM entities with SQL indexes — confirm that
   * AlertService.aggregate exists in this form, or replace it with a GROUP BY query.
   */
  private static async getTopAlertingMonitors(
    projectId: ObjectID,
    since: Date
  ): Promise<Array<AffectedMonitor>> {
    const result = await AlertService.aggregate({
      pipeline: [
        {
          $match: {
            projectId: projectId.toString(),
            createdAt: { $gte: since },
            monitorId: { $ne: null },
          },
        },
        {
          $group: {
            _id: '$monitorId',
            count: { $sum: 1 },
          },
        },
        { $sort: { count: -1 } },
        { $limit: 10 },
      ],
    });

    // Nothing to resolve: skip the monitor lookup instead of querying with an empty id list.
    if (result.length === 0) {
      return [];
    }

    // Get monitor names
    const monitorIds = result.map((r) => new ObjectID(r._id));
    const monitors = await MonitorService.findBy({
      query: {
        _id: QueryHelper.any(monitorIds),
      },
      select: { _id: true, name: true },
      props: { isRoot: true },
    });
    const monitorMap = new Map(
      monitors.map((m) => [m.id?.toString(), m.name])
    );

    return result.map((r) => ({
      monitorId: r._id,
      // A monitor that no longer exists (or has no name) is reported as 'Unknown'.
      monitorName: monitorMap.get(r._id) || 'Unknown',
      alertCount: r.count,
    }));
  }

  /**
   * Process storm detection and create/update/resolve storm events.
   *
   * - storm and no active event: create one
   * - storm and an active event: refresh its peak figures
   * - no storm but an active event: resolve it
   */
  public static async processStormDetection(
    projectId: ObjectID,
    config?: Partial<StormConfig>
  ): Promise<void> {
    const status = await this.checkStatus(projectId, config);
    if (status.isStorm && !status.activeStormEvent) {
      // New storm detected - create event
      await this.createStormEvent(projectId, status);
    } else if (status.isStorm && status.activeStormEvent) {
      // Storm ongoing - update event
      await this.updateStormEvent(status.activeStormEvent.id!, status);
    } else if (!status.isStorm && status.activeStormEvent) {
      // Storm ended - resolve event
      await this.resolveStormEvent(status.activeStormEvent.id!);
    }
  }

  /**
   * Create a new storm event from the current status and send the start notification.
   */
  private static async createStormEvent(
    projectId: ObjectID,
    status: StormStatus
  ): Promise<AlertStormEvent> {
    const event = await AlertStormEventService.create({
      data: {
        projectId,
        status: StormEventStatus.Active,
        severity: status.severity!,
        startedAt: new Date(),
        peakAlertRate: status.currentRate,
        normalAlertRate: status.normalRate,
        multiplier: status.multiplier,
        affectedMonitors: status.affectedMonitors,
        // The hour that triggered detection counts as part of the storm.
        totalAlertsInStorm: status.currentRate,
      } as AlertStormEvent,
      props: { isRoot: true },
    });

    // Send notifications
    await NotificationService.sendStormStartNotification({
      projectId,
      stormEvent: event,
    });
    logger.info(`Storm detected for project ${projectId}: ${status.multiplier}x normal`);
    return event;
  }

  /**
   * Update an ongoing storm event with the latest peak figures.
   */
  private static async updateStormEvent(
    eventId: ObjectID,
    status: StormStatus
  ): Promise<void> {
    const event = await AlertStormEventService.findOneById({
      id: eventId,
      select: {
        projectId: true,
        startedAt: true,
        peakAlertRate: true,
        // Must be selected: without it the Math.max below compares against 0 and the
        // stored peak multiplier is overwritten by whatever the current value is.
        multiplier: true,
        totalAlertsInStorm: true,
      },
      props: { isRoot: true },
    });
    if (!event) return;

    const totalAlertsInStorm = await this.countAlertsInStorm(event, status);

    await AlertStormEventService.updateOneById({
      id: eventId,
      data: {
        peakAlertRate: Math.max(event.peakAlertRate ?? 0, status.currentRate),
        multiplier: Math.max(event.multiplier ?? 0, status.multiplier),
        totalAlertsInStorm,
        affectedMonitors: status.affectedMonitors,
      },
      props: { isRoot: true },
    });
  }

  /**
   * Total alerts attributed to a storm so far.
   *
   * `status.currentRate` is a rolling one-hour count, so adding it to the stored total
   * on every run would count the same alerts again each time. Instead the alerts are
   * recounted from the start of the detection window (one hour before `startedAt`,
   * matching the value written when the event was created).
   */
  private static async countAlertsInStorm(
    event: AlertStormEvent,
    status: StormStatus
  ): Promise<number> {
    const previousTotal = event.totalAlertsInStorm ?? 0;
    if (!event.projectId || !event.startedAt) {
      // Not enough information to recount; never let the total go down.
      return Math.max(previousTotal, status.currentRate);
    }

    const windowStart = OneUptimeDate.addRemoveHours(event.startedAt, -1);
    const alertsSinceWindowStart = await AlertService.countBy({
      query: {
        projectId: event.projectId,
        createdAt: QueryHelper.greaterThan(windowStart),
      },
      props: { isRoot: true },
    });
    return Math.max(previousTotal, alertsSinceWindowStart);
  }

  /**
   * Resolve a storm event: stamp the end time and duration, then send the end notification.
   */
  private static async resolveStormEvent(eventId: ObjectID): Promise<void> {
    const event = await AlertStormEventService.findOneById({
      id: eventId,
      select: { startedAt: true, projectId: true },
      props: { isRoot: true },
    });
    if (!event) return;

    const now = new Date();
    const durationMinutes = Math.round(
      (now.getTime() - event.startedAt!.getTime()) / 60000
    );

    await AlertStormEventService.updateOneById({
      id: eventId,
      data: {
        status: StormEventStatus.Resolved,
        endedAt: now,
        durationMinutes,
      },
      props: { isRoot: true },
    });

    // Send notification
    await NotificationService.sendStormEndNotification({
      projectId: event.projectId!,
      stormEventId: eventId,
      durationMinutes,
    });
    logger.info(`Storm resolved for project ${event.projectId} after ${durationMinutes} minutes`);
  }
}
```
---
### 2. NoiseReductionAnalytics
Service for calculating and retrieving noise reduction metrics.
**File Location:** `/Common/Server/Utils/Alert/NoiseReductionAnalytics.ts`
```typescript
import NoiseReductionMetric from 'Common/Models/DatabaseModels/NoiseReductionMetric';
import NoiseReductionMetricService from '../../Services/NoiseReductionMetricService';
import AlertService from '../../Services/AlertService';
import SuppressedAlertLogService from '../../Services/SuppressedAlertLogService';
import AlertFingerprintService from '../../Services/AlertFingerprintService';
import AlertEpisodeService from '../../Services/AlertEpisodeService';
import ObjectID from 'Common/Types/ObjectID';
import OneUptimeDate from 'Common/Types/Date';
import QueryHelper from '../../Types/Database/QueryHelper';
/**
 * Noise-reduction totals over a date range, as returned by
 * NoiseReductionAnalytics.getSummary. Each count is the sum of the matching
 * daily NoiseReductionMetric column over the rows found in the range.
 */
export interface NoiseReductionSummary {
// The requested range, echoed back unchanged.
period: {
startDate: Date;
endDate: Date;
};
totalAlertTriggers: number;
alertsCreated: number;
deduplicated: number;
suppressed: number;
// Sum of the daily `alertsGrouped` column (note the shorter name here).
grouped: number;
notificationsSent: number;
// (deduplicated + suppressed) / totalAlertTriggers * 100, rounded to two decimals;
// 0 when there were no triggers in the range.
noiseReductionPercent: number;
}
export default class NoiseReductionAnalytics {
  /** Round to two decimal places (used for stored and reported percentages). */
  private static roundToTwoDecimals(value: number): number {
    return Math.round(value * 100) / 100;
  }

  /**
   * Share of alert triggers that never became alerts, as a percentage.
   * Returns 0 when there were no triggers, to avoid dividing by zero.
   */
  private static computeNoiseReductionPercent(
    deduplicated: number,
    suppressed: number,
    totalAlertTriggers: number
  ): number {
    if (totalAlertTriggers <= 0) {
      return 0;
    }
    return ((deduplicated + suppressed) / totalAlertTriggers) * 100;
  }

  /**
   * Calculate daily noise reduction metrics for a project and upsert the row for that day.
   *
   * @param projectId - Project to calculate for.
   * @param date - Any instant within the day of interest; the day's start/end are derived from it.
   * @returns The created or updated metric row.
   */
  public static async calculateDailyMetrics(
    projectId: ObjectID,
    date: Date
  ): Promise<NoiseReductionMetric> {
    const startOfDay = OneUptimeDate.getStartOfDay(date);
    const endOfDay = OneUptimeDate.getEndOfDay(date);
    // A fresh range object per query, so no query shares state with another.
    const withinDay = () => QueryHelper.between(startOfDay, endOfDay);

    // The lookups are independent of each other, so run them concurrently.
    const [
      alertsCreated,
      fingerprints,
      suppressed,
      alertsGrouped,
      episodesCreated,
      existingMetric,
    ] = await Promise.all([
      // Count alerts created
      AlertService.countBy({
        query: {
          projectId,
          createdAt: withinDay(),
        },
        props: { isRoot: true },
      }),
      // Fingerprints whose dedup window started that day (for the deduplicated count)
      AlertFingerprintService.findBy({
        query: {
          projectId,
          windowStartAt: withinDay(),
        },
        select: { duplicateCount: true },
        props: { isRoot: true },
      }),
      // Count suppressed
      SuppressedAlertLogService.countBy({
        query: {
          projectId,
          suppressedAt: withinDay(),
        },
        props: { isRoot: true },
      }),
      // Count grouped alerts
      AlertService.countBy({
        query: {
          projectId,
          createdAt: withinDay(),
          episodeId: QueryHelper.notNull(),
        },
        props: { isRoot: true },
      }),
      // Count episodes created
      AlertEpisodeService.countBy({
        query: {
          projectId,
          startedAt: withinDay(),
        },
        props: { isRoot: true },
      }),
      // Existing row for the day, if any (decides create vs update below)
      NoiseReductionMetricService.findOneBy({
        query: {
          projectId,
          date: startOfDay,
        },
        select: { _id: true },
        props: { isRoot: true },
      }),
    ]);

    const deduplicated = fingerprints.reduce(
      (sum, fp) => sum + (fp.duplicateCount || 0),
      0
    );

    // Calculate totals
    const totalAlertTriggers = alertsCreated + deduplicated + suppressed;
    const noiseReductionPercent = this.computeNoiseReductionPercent(
      deduplicated,
      suppressed,
      totalAlertTriggers
    );

    const metricData: Partial<NoiseReductionMetric> = {
      projectId,
      date: startOfDay,
      totalAlertTriggers,
      alertsCreated,
      deduplicated,
      suppressed,
      alertsGrouped,
      episodesCreated,
      noiseReductionPercent: this.roundToTwoDecimals(noiseReductionPercent),
    };

    // Create or update metric
    if (existingMetric) {
      await NoiseReductionMetricService.updateOneById({
        id: existingMetric.id!,
        data: metricData,
        props: { isRoot: true },
      });
      // Copy the new values onto the fetched instance so callers get a real model
      // object rather than a plain object cast to the model type.
      return Object.assign(existingMetric, metricData);
    }

    return await NoiseReductionMetricService.create({
      data: metricData as NoiseReductionMetric,
      props: { isRoot: true },
    });
  }

  /**
   * Get noise reduction summary for a date range by summing the stored daily metrics.
   *
   * @param projectId - Project to summarise.
   * @param startDate - Start of the range (matched against the daily `date` column).
   * @param endDate - End of the range.
   * @returns Summed counts plus the overall noise reduction percentage.
   */
  public static async getSummary(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<NoiseReductionSummary> {
    const metrics = await NoiseReductionMetricService.findBy({
      query: {
        projectId,
        date: QueryHelper.between(startDate, endDate),
      },
      select: {
        totalAlertTriggers: true,
        alertsCreated: true,
        deduplicated: true,
        suppressed: true,
        alertsGrouped: true,
        notificationsSent: true,
      },
      props: { isRoot: true },
    });

    const totals = metrics.reduce(
      (acc, m) => ({
        totalAlertTriggers: acc.totalAlertTriggers + (m.totalAlertTriggers || 0),
        alertsCreated: acc.alertsCreated + (m.alertsCreated || 0),
        deduplicated: acc.deduplicated + (m.deduplicated || 0),
        suppressed: acc.suppressed + (m.suppressed || 0),
        grouped: acc.grouped + (m.alertsGrouped || 0),
        notificationsSent: acc.notificationsSent + (m.notificationsSent || 0),
      }),
      {
        totalAlertTriggers: 0,
        alertsCreated: 0,
        deduplicated: 0,
        suppressed: 0,
        grouped: 0,
        notificationsSent: 0,
      }
    );

    // Recomputed from the summed counts rather than averaging the daily percentages,
    // so busy days weigh more than quiet ones.
    const noiseReductionPercent = this.computeNoiseReductionPercent(
      totals.deduplicated,
      totals.suppressed,
      totals.totalAlertTriggers
    );

    return {
      period: { startDate, endDate },
      ...totals,
      noiseReductionPercent: this.roundToTwoDecimals(noiseReductionPercent),
    };
  }
}
```
---
### 3. Worker Jobs
#### Storm Monitor Job
**File Location:** `/Worker/Jobs/AlertStorm/Monitor.ts`
```typescript
import RunCron from '../../Utils/Cron';
import { EVERY_FIVE_MINUTES } from 'Common/Utils/CronTime';
import StormDetector from 'Common/Server/Utils/Alert/StormDetector';
import ProjectService from 'Common/Server/Services/ProjectService';
import logger from 'Common/Server/Utils/Logger';

// Every five minutes, run storm detection for each unblocked project.
RunCron(
  'AlertStorm:Monitor',
  { schedule: EVERY_FIVE_MINUTES, runOnStartup: false },
  async () => {
    // Get all active projects
    // NOTE(review): only the first 1000 projects are fetched; any beyond that are
    // never checked. Page through the results if the installation can exceed this.
    const projects = await ProjectService.findBy({
      query: { isBlocked: false },
      select: { _id: true },
      limit: 1000,
      props: { isRoot: true },
    });

    for (const project of projects) {
      try {
        await StormDetector.processStormDetection(project.id!);
      } catch (error) {
        // One failing project must not stop detection for the rest.
        logger.error(
          `Error processing storm detection for project ${project.id}:`,
          error
        );
      }
    }
  }
);
```
#### Daily Metrics Job
**File Location:** `/Worker/Jobs/NoiseReduction/DailyMetrics.ts`
```typescript
import RunCron from '../../Utils/Cron';
import { EVERY_DAY_AT_MIDNIGHT } from 'Common/Utils/CronTime';
import NoiseReductionAnalytics from 'Common/Server/Utils/Alert/NoiseReductionAnalytics';
import ProjectService from 'Common/Server/Services/ProjectService';
import OneUptimeDate from 'Common/Types/Date';
import logger from 'Common/Server/Utils/Logger';

// At midnight, compute the previous day's noise reduction metrics for each unblocked project.
RunCron(
  'NoiseReduction:DailyMetrics',
  { schedule: EVERY_DAY_AT_MIDNIGHT, runOnStartup: false },
  async () => {
    // Calculate metrics for yesterday
    const yesterday = OneUptimeDate.addRemoveDays(
      OneUptimeDate.getCurrentDate(),
      -1
    );

    // NOTE(review): only the first 1000 projects are fetched; any beyond that get
    // no metrics. Page through the results if the installation can exceed this.
    const projects = await ProjectService.findBy({
      query: { isBlocked: false },
      select: { _id: true },
      limit: 1000,
      props: { isRoot: true },
    });

    let failedCount = 0;
    for (const project of projects) {
      try {
        await NoiseReductionAnalytics.calculateDailyMetrics(
          project.id!,
          yesterday
        );
      } catch (error) {
        // One failing project must not stop the calculation for the rest.
        failedCount++;
        logger.error(
          `Error calculating metrics for project ${project.id}:`,
          error
        );
      }
    }

    // Report how many actually succeeded, not just how many were attempted.
    logger.info(
      `Calculated daily noise reduction metrics for ${projects.length - failedCount} of ${projects.length} projects`
    );
  }
);
```
---
## Implementation Checklist
### Phase 1: Storm Detection
- [ ] Create StormDetector utility
- [ ] Create AlertStormEventService
- [ ] Implement storm detection algorithm
- [ ] Create storm monitor worker job
### Phase 2: Notifications
- [ ] Storm start notification
- [ ] Storm end notification
- [ ] Admin notification integration
### Phase 3: Analytics
- [ ] Create NoiseReductionAnalytics utility
- [ ] Create NoiseReductionMetricService
- [ ] Implement daily metrics calculation
- [ ] Create daily metrics worker job
### Phase 4: Testing
- [ ] Unit tests for StormDetector
- [ ] Unit tests for NoiseReductionAnalytics
- [ ] Integration tests for worker jobs

View File

@@ -0,0 +1,222 @@
# API Design for Alert Storm Detection
## Overview
This document defines the REST API endpoints for Alert Storm Detection and Noise Reduction Analytics.
## Storm Events API
### Get Current Storm Status
```http
GET /api/project/{projectId}/alert-storm/status
```
**Response:**
```json
{
"isStorm": true,
"severity": "critical",
"currentRate": 150,
"normalRate": 30,
"multiplier": 5.0,
"affectedMonitors": [
{ "monitorId": "mon-1", "monitorName": "mysql-prod", "alertCount": 45 },
{ "monitorId": "mon-2", "monitorName": "api-gateway", "alertCount": 32 }
],
"activeStormEvent": {
"_id": "storm-event-1",
"startedAt": "2026-01-20T10:00:00Z",
"peakAlertRate": 180,
"durationMinutes": 45
}
}
```
### List Storm Events
```http
GET /api/project/{projectId}/alert-storm-event
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `status` | string | Filter by status (active, resolved) |
| `startedAt` | DateRange | Filter by start date |
| `limit` | number | Results per page |
| `skip` | number | Pagination offset |
**Response:**
```json
{
"data": [
{
"_id": "storm-1",
"status": "resolved",
"severity": "critical",
"startedAt": "2026-01-19T14:00:00Z",
"endedAt": "2026-01-19T15:30:00Z",
"durationMinutes": 90,
"peakAlertRate": 250,
"normalAlertRate": 30,
"multiplier": 8.33,
"totalAlertsInStorm": 450,
"affectedMonitors": [...]
}
],
"count": 15
}
```
### Get Storm Event Details
```http
GET /api/project/{projectId}/alert-storm-event/{eventId}
```
---
## Noise Reduction Analytics API
### Get Noise Reduction Summary
```http
GET /api/project/{projectId}/noise-reduction/summary
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `startDate` | Date | Start of period |
| `endDate` | Date | End of period |
**Response:**
```json
{
"period": {
"startDate": "2026-01-13T00:00:00Z",
"endDate": "2026-01-20T00:00:00Z"
},
"totalAlertTriggers": 10000,
"alertsCreated": 3500,
"deduplicated": 4000,
"suppressed": 2500,
"grouped": 1500,
"notificationsSent": 2000,
"noiseReductionPercent": 65.0,
"breakdown": {
"byDeduplication": 40.0,
"bySuppression": 25.0
}
}
```
### Get Daily Metrics
```http
GET /api/project/{projectId}/noise-reduction/daily
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `startDate` | Date | Start of period |
| `endDate` | Date | End of period |
**Response:**
```json
{
"data": [
{
"date": "2026-01-20",
"totalAlertTriggers": 1500,
"alertsCreated": 500,
"deduplicated": 600,
"suppressed": 400,
"alertsGrouped": 200,
"episodesCreated": 15,
"noiseReductionPercent": 66.67
},
{
"date": "2026-01-19",
"totalAlertTriggers": 1200,
"alertsCreated": 450
}
]
}
```
### Get Top Noise Sources
```http
GET /api/project/{projectId}/noise-reduction/top-sources
```
**Response:**
```json
{
"byMonitor": [
{ "monitorId": "mon-1", "monitorName": "mysql-prod", "alertCount": 500, "duplicateCount": 300 },
{ "monitorId": "mon-2", "monitorName": "api-gateway", "alertCount": 350, "duplicateCount": 150 }
],
"bySeverity": [
{ "severityId": "sev-1", "severityName": "Warning", "alertCount": 600 },
{ "severityId": "sev-2", "severityName": "Critical", "alertCount": 200 }
]
}
```
---
## Storm Configuration API
### Get Storm Config
```http
GET /api/project/{projectId}/alert-storm/config
```
**Response:**
```json
{
"stormThreshold": 3,
"criticalThreshold": 5,
"minimumAlertRate": 10,
"baselineHours": 24,
"enableEmergencySuppression": false,
"notifyOnStormStart": true,
"notifyOnStormEnd": true
}
```
### Update Storm Config
```http
PUT /api/project/{projectId}/alert-storm/config
```
---
## Implementation Checklist
### Storm API
- [ ] GET /alert-storm/status
- [ ] GET /alert-storm-event (list)
- [ ] GET /alert-storm-event/:id
- [ ] GET /alert-storm/config
- [ ] PUT /alert-storm/config
### Analytics API
- [ ] GET /noise-reduction/summary
- [ ] GET /noise-reduction/daily
- [ ] GET /noise-reduction/top-sources

View File

@@ -0,0 +1,519 @@
# UI Implementation for Alert Storm Detection
## Overview
This document details the frontend components and pages required for Alert Storm Detection and Noise Reduction Analytics functionality.
## Navigation Structure
```
Dashboard
├── Alerts
│ ├── Alerts (existing)
│ ├── Episodes (from Grouping plan)
│ └── Storm History (NEW)
└── Settings
└── Alerts
├── Alert States (existing)
├── Alert Severities (existing)
├── Grouping Rules
├── Suppression Rules
├── Deduplication
└── Storm Detection (NEW)
Analytics (NEW section or add to existing)
└── Noise Reduction Dashboard (NEW)
```
---
## Pages to Create
### 1. Storm Detection Settings Page
**File Location:** `/Dashboard/src/Pages/Settings/AlertStormDetection.tsx`
**Route:** `/dashboard/:projectId/settings/alert-storm-detection`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Settings > Alert Storm Detection │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Alert Storm Detection identifies when alert volume spikes abnormally above │ │
│ │ historical baselines. This helps identify major incidents and prevent alert │ │
│ │ fatigue during outages. │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ DETECTION THRESHOLDS │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ Storm Threshold (multiplier) │
│ ┌──────────┐ │
│ │ 3 │ x normal rate │
│ └──────────┘ │
│ Alert volume must exceed this multiplier to be considered a storm. │
│ │
│ Critical Storm Threshold (multiplier) │
│ ┌──────────┐ │
│ │ 5 │ x normal rate │
│ └──────────┘ │
│ Storms exceeding this multiplier are marked as critical. │
│ │
│ Minimum Alert Rate │
│ ┌──────────┐ │
│ │ 10 │ alerts per hour │
│ └──────────┘ │
│ Minimum baseline rate required before storm detection activates. │
│ │
│ Baseline Period │
│ ┌──────────┐ │
│ │ 24 │ hours │
│ └──────────┘ │
│ Historical period used to calculate normal alert rate. │
│ │
│ NOTIFICATIONS │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ☑ Notify when storm starts │
│ ☑ Notify when storm ends │
│ ☐ Enable emergency suppression during storms │
│ ⚠️ This will automatically suppress non-critical alerts during storms │
│ │
│ [Save Changes] │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 2. Storm History Page
**File Location:** `/Dashboard/src/Pages/Alerts/StormHistory.tsx`
**Route:** `/dashboard/:projectId/alerts/storm-history`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Alerts > Storm History │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ 🔴 ACTIVE STORM Started: 45 minutes ago │ │
│ │ │ │
│ │ Current Rate: 150 alerts/hour (5x normal) │ │
│ │ Peak Rate: 180 alerts/hour │ │
│ │ Affected Monitors: 12 │ │
│ │ │ │
│ │ [View Details] │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ STORM EVENTS [Filters ▼] │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌──────────┬──────────┬───────────┬──────────┬──────────┬──────────┬─────────────┐ │
│ │ Status │ Severity │ Started │ Duration │ Peak │ Alerts │ Monitors │ │
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
│ │ 🔴 Active│ Critical │ Today │ 45m │ 180/hr │ 450 │ 12 │ │
│ │ │ │ 10:15 AM │ │ (6x) │ │ │ │
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
│ │ ✅ Resv'd│ Storm │ Yesterday │ 1h 30m │ 120/hr │ 280 │ 8 │ │
│ │ │ │ 2:30 PM │ │ (4x) │ │ │ │
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
│ │ ✅ Resv'd│ Critical │ Jan 18 │ 2h 15m │ 250/hr │ 620 │ 15 │ │
│ │ │ │ 8:00 AM │ │ (8.3x) │ │ │ │
│ └──────────┴──────────┴───────────┴──────────┴──────────┴──────────┴─────────────┘ │
│ │
│ ◄ Previous Page 1 of 3 Next ► │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 3. Storm Event Detail Page
**File Location:** `/Dashboard/src/Pages/Alerts/StormEventDetail.tsx`
**Route:** `/dashboard/:projectId/alerts/storm-history/:stormEventId`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Storm History > Storm Event #storm-123 │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌───────────────────────────────────────┬─────────────────────────────────────────┐ │
│ │ STORM SUMMARY │ ALERT VOLUME TIMELINE │ │
│ │ │ │ │
│ │ Status: 🔴 Active │ 250 ─┬───────────────────────────── │ │
│ │ Severity: Critical │ │ ╭─────╮ │ │
│ │ Started: Jan 20, 2026 10:15 AM │ 200 ─┤ ╭╯ ╰╮ │ │
│ │ Duration: 45 minutes (ongoing) │ │ ╭╯ ╰╮ │ │
│ │ │ 150 ─┤ ╭╯ Peak ╰─current │ │
│ │ Peak Alert Rate: 180/hour │ │ ╭╯ 180/hr │ │
│ │ Normal Rate: 30/hour │ 100 ─┤ ╭╯ │ │
│ │ Multiplier: 6x │ │ ╭╯ │ │
│ │ │ 50 ─┤──╯ │ │
│ │ Total Alerts: 450 │ │ baseline: 30/hr ─ ─ ─ ─ ─ ─ │ │
│ │ Affected Monitors: 12 │ 0 ─┴───────────────────────────── │ │
│ │ │ 10:00 10:15 10:30 10:45 │ │
│ └───────────────────────────────────────┴─────────────────────────────────────────┘ │
│ │
│ TOP ALERTING MONITORS │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌──────────────────────────────────┬──────────┬──────────────────────────────────┐ │
│ │ Monitor │ Alerts │ Distribution │ │
│ ├──────────────────────────────────┼──────────┼──────────────────────────────────┤ │
│ │ 🖥️ mysql-prod-01 │ 85 │ ██████████████████░░ 19% │ │
│ │ 🖥️ api-gateway-main │ 72 │ ███████████████░░░░░ 16% │ │
│ │ 🖥️ redis-cluster-a │ 58 │ ████████████░░░░░░░░ 13% │ │
│ │ 🖥️ postgres-replica-02 │ 45 │ ██████████░░░░░░░░░░ 10% │ │
│ │ 🖥️ load-balancer-east │ 38 │ ████████░░░░░░░░░░░░ 8% │ │
│ └──────────────────────────────────┴──────────┴──────────────────────────────────┘ │
│ │
│ ALERTS IN THIS STORM [View All] │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌───────┬──────────────────────────────────────┬──────────┬──────────────────────┐ │
│ │ ID │ Title │ Monitor │ Time │ │
│ ├───────┼──────────────────────────────────────┼──────────┼──────────────────────┤ │
│ │ #1234 │ Connection timeout │ mysql-01 │ 10:45 AM │ │
│ │ #1233 │ Response time exceeded threshold │ api-gw │ 10:44 AM │ │
│ │ #1232 │ Memory usage critical │ redis-a │ 10:43 AM │ │
│ └───────┴──────────────────────────────────────┴──────────┴──────────────────────┘ │
│ │
│ Showing 3 of 450 alerts │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 4. Noise Reduction Analytics Dashboard
**File Location:** `/Dashboard/src/Pages/Analytics/NoiseReductionDashboard.tsx`
**Route:** `/dashboard/:projectId/analytics/noise-reduction`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Analytics > Noise Reduction [Last 7 Days ▼] │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ NOISE REDUCTION OVERVIEW │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
│ │ Alert Triggers │ │ Alerts Created │ │ Noise Reduced │ │ Notifications │ │
│ │ │ │ │ │ │ │ │ │
│ │ 10,000 │ │ 3,500 │ │ 65% │ │ 2,000 │ │
│ │ total │ │ created │ │ reduction │ │ sent │ │
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
│ │
│ REDUCTION BREAKDOWN │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ │ │
│ │ Total Alert Triggers: 10,000 │ │
│ │ ├─────────────────────────────────────────────────────────────────────────────│ │
│ │ │ Deduplicated │ Suppressed │ Grouped │ Created │ │
│ │ │ 4,000 (40%) │ 2,500 (25%) │ 1,500 (15%)│ 3,500 (35%) │ │
│ │ │ ████████████████████ │ ████████████ │ ███████ │ ██████████████ │ │
│ │ └─────────────────────────────────────────────────────────────────────────────│ │
│ │ │ │
│ │ Legend: │ │
│ │ ■ Deduplicated - Merged with existing alerts │ │
│ │ ■ Suppressed - Blocked by suppression rules │ │
│ │ ■ Grouped - Added to existing episodes (reduced notifications) │ │
│ │ ■ Created - New unique alerts │ │
│ │ │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ DAILY TREND │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ 2000 ─┬───────────────────────────────────────────────────────────────── │ │
│ │ │ │ │
│ │ 1500 ─┤ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ │ │
│ │ │ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ │ │
│ │ 1000 ─┤ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓ │ │
│ │ │ ▓▓▓▓░░ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
│ │ 500 ─┤ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
│ │ │ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
│ │ 0 ─┴────────────────────────────────────────────────────────────────── │ │
│ │ Jan 14 Jan 15 Jan 16 Jan 17 Jan 18 Jan 19 Jan 20 │ │
│ │ │ │
│ │ ▓ Triggers ░ Created Line: Reduction % │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ TOP NOISE SOURCES │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ By Monitor By Severity │
│ ┌────────────────────────────────────┐ ┌────────────────────────────────────────┐ │
│ │ 1. mysql-prod 500 alerts │ │ 1. Warning 600 alerts │ │
│ │ ████████████████████ (300 dupe)│ │ ████████████████████████ │ │
│ │ │ │ │ │
│ │ 2. api-gateway 350 alerts │ │ 2. High 400 alerts │ │
│ │ ██████████████ (150 dupe) │ │ ████████████████ │ │
│ │ │ │ │ │
│ │ 3. redis-cluster 280 alerts │ │ 3. Critical 200 alerts │ │
│ │ ███████████ (180 dupe) │ │ ████████ │ │
│ └────────────────────────────────────┘ └────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
## Components to Create
### 1. StormStatusBanner
**File:** `/Dashboard/src/Components/Storm/StormStatusBanner.tsx`
Global banner that appears when a storm is active.
```typescript
/**
 * Props accepted by the StormStatusBanner component (the global banner
 * described as appearing when a storm is active).
 */
interface StormStatusBannerProps {
  /**
   * Storm event to render, or `null`.
   * NOTE(review): `null` presumably means "no active storm" — confirm with the component.
   */
  stormEvent: AlertStormEvent | null;

  /** Optional zero-argument callback; by its name, wired to the banner's dismiss action. */
  onDismiss?: () => void;

  /** Optional zero-argument callback; by its name, wired to the banner's "View Details" action. */
  onViewDetails?: () => void;
}
```
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ ⚠️ ALERT STORM DETECTED - 150 alerts/hour (5x normal) - 12 monitors affected │
│ [View Details] [Dismiss] │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
### 2. StormSeverityBadge
**File:** `/Dashboard/src/Components/Storm/StormSeverityBadge.tsx`
Badge showing storm severity level.
```typescript
/**
 * Props accepted by the StormSeverityBadge component.
 */
interface StormSeverityBadgeProps {
  /** Severity level to display; one of the four literal levels below. */
  severity: 'normal' | 'elevated' | 'storm' | 'critical';

  /**
   * Optional flag.
   * NOTE(review): presumably toggles a text label next to the badge — default is not defined here.
   */
  showLabel?: boolean;
}
```
### 3. AlertVolumeChart
**File:** `/Dashboard/src/Components/Storm/AlertVolumeChart.tsx`
Line chart showing alert volume over time.
```typescript
/**
 * Props accepted by the AlertVolumeChart component (a line chart of alert
 * volume over time).
 */
interface AlertVolumeChartProps {
  /** Data points to plot, one entry per sample. */
  data: {
    /** Moment the sample refers to. */
    timestamp: Date;
    /** Number of alerts for this sample. */
    alertCount: number;
    /** Optional baseline value for this sample. */
    baseline?: number;
  }[];

  /** Optional flag; by its name, controls whether the baseline is drawn. */
  showBaseline?: boolean;

  /** Optional flag; by its name, controls whether storm periods are highlighted. */
  highlightStormPeriods?: boolean;
}
```
### 4. NoiseReductionSummaryCard
**File:** `/Dashboard/src/Components/Analytics/NoiseReductionSummaryCard.tsx`
Card showing noise reduction summary statistics.
```typescript
/**
 * Props accepted by the NoiseReductionSummaryCard component (summary
 * statistics for noise reduction). All fields are required numbers.
 */
interface NoiseReductionSummaryCardProps {
  /** Total number of alert triggers in the period. */
  totalTriggers: number;
  /** Number of alerts that were actually created. */
  alertsCreated: number;
  /** Number of triggers counted as deduplicated. */
  deduplicated: number;
  /** Number of triggers counted as suppressed. */
  suppressed: number;
  /** Number of alerts counted as grouped. */
  grouped: number;
  /**
   * Noise reduction figure.
   * NOTE(review): presumably on a 0-100 scale like `noiseReductionPercent` in the API — confirm.
   */
  reductionPercent: number;
}
```
### 5. NoiseReductionBreakdownChart
**File:** `/Dashboard/src/Components/Analytics/NoiseReductionBreakdownChart.tsx`
Stacked bar or donut chart showing reduction breakdown.
```typescript
/**
 * Props accepted by the NoiseReductionBreakdownChart component (a stacked
 * bar or donut chart of the reduction breakdown). One required count per
 * chart segment.
 */
interface NoiseReductionBreakdownChartProps {
  /** Count for the "deduplicated" segment. */
  deduplicated: number;
  /** Count for the "suppressed" segment. */
  suppressed: number;
  /** Count for the "grouped" segment. */
  grouped: number;
  /** Count for the "created" segment. */
  created: number;
}
```
### 6. TopNoiseSourcesTable
**File:** `/Dashboard/src/Components/Analytics/TopNoiseSourcesTable.tsx`
Table showing top noise-generating monitors or services.
```typescript
/**
 * Props accepted by the TopNoiseSourcesTable component (a table of the top
 * noise-generating monitors or services).
 */
interface TopNoiseSourcesTableProps {
  /** Rows to display, one per noise source. */
  sources: Array<{
    /** Identifier of the source. */
    id: string;
    /** Display name of the source. */
    name: string;
    /** Number of alerts attributed to this source. */
    alertCount: number;
    /**
     * Number of duplicates attributed to this source.
     * Optional because not every grouping has this figure: the
     * top-sources API returns it for `byMonitor` rows but not for
     * `bySeverity` rows, so callers must not be forced to invent a value.
     */
    duplicateCount?: number;
  }>;

  /** Dimension the rows are grouped by. */
  groupBy: 'monitor' | 'severity' | 'service';
}
```
### 7. DailyMetricsChart
**File:** `/Dashboard/src/Components/Analytics/DailyMetricsChart.tsx`
Bar chart showing daily noise reduction metrics.
```typescript
/**
 * Props accepted by the DailyMetricsChart component (a bar chart of daily
 * noise reduction metrics).
 */
interface DailyMetricsChartProps {
  /** One entry per day to plot. */
  data: {
    /**
     * Day the entry refers to, as a string.
     * NOTE(review): format is not fixed here — the daily metrics API example uses `YYYY-MM-DD`; confirm.
     */
    date: string;
    /** Total alert triggers for the day. */
    totalTriggers: number;
    /** Alerts created for the day. */
    alertsCreated: number;
    /** Noise reduction figure for the day. */
    reductionPercent: number;
  }[];
}
```
### 8. AffectedMonitorsTable
**File:** `/Dashboard/src/Components/Storm/AffectedMonitorsTable.tsx`
Table showing monitors contributing to a storm.
```typescript
/**
 * Props accepted by the AffectedMonitorsTable component (a table of the
 * monitors contributing to a storm).
 */
interface AffectedMonitorsTableProps {
  /** One row per contributing monitor. */
  monitors: {
    /** Identifier of the monitor. */
    monitorId: string;
    /** Display name of the monitor. */
    monitorName: string;
    /** Number of alerts attributed to this monitor. */
    alertCount: number;
    /**
     * Share attributed to this monitor.
     * NOTE(review): presumably 0-100 of the storm's total alerts, as in the detail wireframe — confirm.
     */
    percentage: number;
  }[];
}
```
---
## Routing Configuration
Add to route configuration:
```typescript
// Storm Detection Settings
{
path: '/dashboard/:projectId/settings/alert-storm-detection',
component: AlertStormDetectionSettingsPage,
}
// Storm History
{
path: '/dashboard/:projectId/alerts/storm-history',
component: StormHistoryPage,
}
// Storm Event Detail
{
path: '/dashboard/:projectId/alerts/storm-history/:stormEventId',
component: StormEventDetailPage,
}
// Noise Reduction Analytics
{
path: '/dashboard/:projectId/analytics/noise-reduction',
component: NoiseReductionDashboardPage,
}
```
---
## Global Integration
### Dashboard Header Integration
Add storm status indicator to main dashboard header:
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ OneUptime [Projects ▼] Alerts Monitors Status Pages 🔴 Storm Active │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
### Alerts Page Integration
Add storm status banner above alerts list when storm is active:
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Alerts │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ ⚠️ Alert Storm Active - Click to view details │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ [Normal alerts table continues below...] │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
## Implementation Checklist
### Pages
- [ ] Storm Detection settings page
- [ ] Storm History list page
- [ ] Storm Event detail page
- [ ] Noise Reduction Analytics dashboard
### Components
- [ ] StormStatusBanner
- [ ] StormSeverityBadge
- [ ] AlertVolumeChart
- [ ] NoiseReductionSummaryCard
- [ ] NoiseReductionBreakdownChart
- [ ] TopNoiseSourcesTable
- [ ] DailyMetricsChart
- [ ] AffectedMonitorsTable
### Global Integrations
- [ ] Add storm indicator to dashboard header
- [ ] Add storm banner to Alerts page
- [ ] Add sidebar navigation items
### Styling
- [ ] Storm severity color scheme (yellow/orange/red)
- [ ] Chart styling for analytics
- [ ] Banner animation styles
- [ ] Responsive layouts
### Data Fetching
- [ ] Storm status polling (every 30 seconds when on dashboard)
- [ ] Storm events API integration
- [ ] Noise reduction metrics API integration
- [ ] WebSocket support for real-time storm updates (optional)

View File

@@ -0,0 +1,159 @@
# Alert Storm Detection Implementation Plan
## Overview
This sub-plan details the implementation of Alert Storm Detection and Analytics functionality for OneUptime. This feature detects when alert volume spikes abnormally and provides noise reduction analytics.
## Documents
| Document | Description |
|----------|-------------|
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
| [2-Backend.md](./2-Backend.md) | Backend services and storm detector |
| [3-API.md](./3-API.md) | REST API endpoints |
| [4-UI.md](./4-UI.md) | Frontend components and pages |
## Feature Summary
### What is Alert Storm Detection?
Alert Storm Detection identifies when the rate of incoming alerts significantly exceeds normal patterns. This helps operators understand when something unusual is happening and optionally enables automatic suppression during storms.
### Storm Detection Logic
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ Storm Detection Algorithm │
└─────────────────────────────────────────────────────────────────────────────────┘
Current Hour Historical Average Storm Check
┌─────────────────┐ ┌─────────────────────┐ ┌─────────────┐
│ │ │ │ │ │
│ 150 alerts │ vs │ 30 alerts/hour │ = │ 5x normal │
│ (this hour) │ │ (last 24h avg) │ │ = STORM! │
│ │ │ │ │ │
└─────────────────┘ └─────────────────────┘ └─────────────┘
```
### Key Capabilities
1. **Storm Detection** - Identify abnormal alert spikes
2. **Historical Analysis** - Compare against baseline patterns
3. **Storm Alerts** - Notify admins when storm detected
4. **Emergency Suppression** - Optional auto-suppression during storms
5. **Noise Reduction Analytics** - Track overall noise reduction metrics
6. **Top Alerting Sources** - Identify which monitors/services cause most noise
### Storm Thresholds
| Level | Multiplier | Description |
|-------|------------|-------------|
| Normal | < 2x | Within normal variance |
| Elevated | ≥ 2x and < 3x | Higher than usual |
| Storm | ≥ 3x and ≤ 5x | Significant spike |
| Critical Storm | > 5x | Major incident likely |
### User Stories
```
As an SRE, I want to be notified when an alert storm starts
so that I know something significant is happening.
As an operator, I want to see which monitors are causing the most alerts
so that I can prioritize investigation.
As a team lead, I want to see noise reduction metrics
so that I can measure the effectiveness of our alert tuning.
As an admin, I want to enable emergency suppression during storms
so that my team isn't overwhelmed during major incidents.
```
## Implementation Phases
### Phase 1: Storm Detection Core (Week 1)
- [ ] Create AlertStormEvent model
- [ ] Implement StormDetector service
- [ ] Create storm monitoring worker job
- [ ] Add storm detection settings
### Phase 2: Storm Notifications (Week 2)
- [ ] Storm start/end notifications
- [ ] Top alerting monitors identification
- [ ] Storm event timeline
- [ ] Admin notifications
### Phase 3: Noise Reduction Analytics (Week 3)
- [ ] Create NoiseReductionMetric model
- [ ] Daily metrics calculation job
- [ ] Deduplication statistics
- [ ] Suppression statistics
- [ ] Grouping statistics
### Phase 4: UI Dashboard (Week 4)
- [ ] Storm status banner
- [ ] Noise reduction dashboard
- [ ] Alert volume charts
- [ ] Top alerting sources view
## Architecture
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ Storm Detection Flow │
└─────────────────────────────────────────────────────────────────────────────────┘
┌───────────────────┐
│ Worker Job │
│ (Every 5 min) │
└─────────┬─────────┘
┌───────────────────────────────────────────────────────────────────────────┐
│ StormDetector.checkStatus() │
├───────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────────┐ │
│ │ Get current │──▶│ Get historical │──▶│ Calculate multiplier │ │
│ │ hour count │ │ average │ │ current / historical │ │
│ └─────────────────┘ └─────────────────┘ └─────────────────────────┘ │
│ │ │
│ ┌─────────┴─────────┐ │
│ │ │ │
│ ▼ ▼ │
│ ┌─────────────────┐ ┌─────────────────┐│
│ │ multiplier < 3 │ │ multiplier >= 3 ││
│ │ = Normal │ │ = STORM ││
│ └─────────────────┘ └────────┬────────┘│
│ │ │
└──────────────────────────────────────────────────────────────────┼─────────┘
┌─────────────────────────┐
│ Storm Actions: │
│ - Create AlertStormEvent│
│ - Notify admins │
│ - Show banner │
│ - Optional: auto-suppress│
└─────────────────────────┘
```
## Success Metrics
| Metric | Target |
|--------|--------|
| Storm detection accuracy | > 95% |
| Detection latency | < 5 minutes |
| False positive rate | < 5% |
| Noise reduction visibility | 100% of projects |
## References
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
- [Alert Grouping Plan](../AlertGrouping/README.md)
- [Alert Suppression Plan](../AlertSuppression/README.md)
- [Alert Deduplication Plan](../AlertDeduplication/README.md)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,909 @@
# Backend Implementation for Alert Suppression
## Overview
This document details the backend services and components required for Alert Suppression functionality.
## Architecture
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ Suppression Evaluation Flow │
└─────────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────┐
│ Alert Trigger │
│ (Monitor/Manual) │
└──────────┬───────────┘
┌──────────────────────────────────────────────────────────────────────────────────┐
│ SuppressionEngine.evaluate() │
├──────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────┐ │
│ │ 1. Get enabled │ │
│ │ rules │ │
│ └────────┬────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────────────┐ │
│ │ 2. For each rule (sorted by priority): │ │
│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │
│ │ │ Match Criteria│─▶│ Check if Rule │─▶│ Apply Action │ │ │
│ │ │ Evaluation │ │ is Active │ │ │ │ │
│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │
│ │ │ │ │
│ │ ┌───────────────┼───────────────┐ │ │
│ │ ▼ ▼ ▼ │ │
│ │ ┌─────────────────┐ ┌──────────────┐ ┌─────────────────┐ │ │
│ │ │ Maintenance │ │ Condition │ │ Rate Limit │ │ │
│ │ │ Window Active? │ │ Met? │ │ Exceeded? │ │ │
│ │ └─────────────────┘ └──────────────┘ └─────────────────┘ │ │
│ └─────────────────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────┐ │
│ │ 3. Determine │ │
│ │ final action │ │
│ └─────────────────┘ │
│ │
└──────────────────────────────────────────────────────────────────────────────────┘
┌──────┴──────┐
│ │
▼ ▼
┌────────┐ ┌────────┐
│SUPPRESS│ │ ALLOW │
└────────┘ └────────┘
```
---
## Services to Create
### 1. AlertSuppressionRuleService
**File Location:** `/Common/Server/Services/AlertSuppressionRuleService.ts`
```typescript
import DatabaseService from './DatabaseService';
import AlertSuppressionRule, {
SuppressionRuleType,
} from '../Models/DatabaseModels/AlertSuppressionRule';
import ObjectID from 'Common/Types/ObjectID';
import SortOrder from 'Common/Types/BaseDatabase/SortOrder';
/**
 * Database service for `AlertSuppressionRule` records.
 *
 * On top of the inherited `DatabaseService` operations it offers helpers to
 * load a project's enabled rules, to pick out the maintenance windows that
 * are active right now, and to bump a rule's suppression counter.
 */
export class Service extends DatabaseService<AlertSuppressionRule> {
  public constructor() {
    super(AlertSuppressionRule);
  }

  /**
   * Get all enabled rules for a project, sorted by priority (ascending).
   *
   * @param projectId - Project whose rules are loaded.
   * @returns The enabled rules with the fields needed for evaluation selected.
   */
  public async getEnabledRulesForProject(
    projectId: ObjectID
  ): Promise<Array<AlertSuppressionRule>> {
    return await this.findBy({
      query: {
        projectId,
        isEnabled: true,
      },
      select: {
        _id: true,
        name: true,
        type: true,
        matchCriteria: true,
        maintenanceWindow: true,
        condition: true,
        rateLimit: true,
        action: true,
        suppressionGroupId: true,
        priority: true,
      },
      sort: { priority: SortOrder.Ascending },
      props: { isRoot: true },
    });
  }

  /**
   * Get active maintenance windows.
   *
   * Loads the enabled rules of the project and keeps only those of type
   * `MaintenanceWindow` whose window is active at the time of the call.
   *
   * @param projectId - Project whose maintenance windows are checked.
   * @returns The maintenance-window rules that are currently active.
   */
  public async getActiveMaintenanceWindows(
    projectId: ObjectID
  ): Promise<Array<AlertSuppressionRule>> {
    const rules = await this.getEnabledRulesForProject(projectId);

    return rules.filter((rule) => {
      if (rule.type !== SuppressionRuleType.MaintenanceWindow) {
        return false;
      }

      return this.isMaintenanceWindowActive(rule);
    });
  }

  /**
   * Check if a maintenance window is currently active.
   *
   * A rule without a `maintenanceWindow` is never active. Recurring windows
   * (flagged recurring and carrying a recurrence rule) are delegated to
   * `evaluateRecurrence`; all others are active while "now" lies between
   * `startTime` and `endTime`, both inclusive.
   *
   * NOTE(review): the comparison assumes `startTime` / `endTime` are `Date`
   * values — confirm they are not plain strings after deserialization.
   */
  private isMaintenanceWindowActive(rule: AlertSuppressionRule): boolean {
    const window = rule.maintenanceWindow;

    if (!window) {
      return false;
    }

    const now = new Date();

    if (window.isRecurring && window.recurrenceRule) {
      return this.evaluateRecurrence(window, now);
    }

    return now >= window.startTime && now <= window.endTime;
  }

  /**
   * Evaluate recurrence rule (RRULE format).
   *
   * Finds the most recent occurrence that started at or before `now` and
   * reports whether `now` still falls inside it. The length of every
   * occurrence equals `endTime - startTime` of the window.
   *
   * The previous implementation took the first occurrence after
   * `now - 24h`. For anything recurring more often than daily that is a
   * stale occurrence, and for windows longer than 24 hours the covering
   * occurrence lies outside the look-back, so active windows were reported
   * as inactive.
   *
   * NOTE(review): behaviour for a recurrence rule without DTSTART depends on
   * the library's default start — confirm rules are stored with DTSTART.
   *
   * @param window - Maintenance window configuration holding the recurrence rule.
   * @param now - Point in time to test.
   * @returns `true` if `now` lies inside an occurrence; `false` otherwise or on any error.
   */
  private evaluateRecurrence(
    window: MaintenanceWindowConfig,
    now: Date
  ): boolean {
    // Use rrule library for parsing
    try {
      const RRule = require('rrule').RRule;
      const rule = RRule.fromString(window.recurrenceRule!);

      // Latest occurrence starting at or before `now` (inclusive).
      const latestOccurrence = rule.before(now, true);

      if (!latestOccurrence) {
        // The recurrence has not started yet.
        return false;
      }

      // Calculate window duration
      const duration = window.endTime.getTime() - window.startTime.getTime();
      const occurrenceEnd = new Date(latestOccurrence.getTime() + duration);

      return now >= latestOccurrence && now <= occurrenceEnd;
    } catch (error) {
      // A rule that cannot be parsed or evaluated is treated as inactive.
      logger.error('Error evaluating recurrence rule:', error);
      return false;
    }
  }

  /**
   * Increment suppressed count for a rule and record the current time as
   * its last trigger time.
   *
   * @param ruleId - Identifier of the rule to update.
   */
  public async incrementSuppressedCount(ruleId: ObjectID): Promise<void> {
    await this.updateOneById({
      id: ruleId,
      data: {
        suppressedCount: QueryHelper.increment(1),
        lastTriggeredAt: new Date(),
      },
      props: { isRoot: true },
    });
  }
}

export default new Service();
```
---
### 2. SuppressionEngine
**File Location:** `/Common/Server/Utils/Alert/SuppressionEngine.ts`
```typescript
import Alert from '../../Models/DatabaseModels/Alert';
import AlertSuppressionRule, {
SuppressionRuleType,
SuppressionAction,
SuppressionMatchCriteria,
RateLimitConfig,
} from '../../Models/DatabaseModels/AlertSuppressionRule';
import AlertSuppressionRuleService from '../../Services/AlertSuppressionRuleService';
import AlertThrottleStateService from '../../Services/AlertThrottleStateService';
import SuppressedAlertLogService from '../../Services/SuppressedAlertLogService';
import ObjectID from 'Common/Types/ObjectID';
import OneUptimeDate from 'Common/Types/Date';
/**
 * Outcome of evaluating the suppression rules for one alert.
 */
export interface SuppressionResult {
  /** Whether at least one active rule asked for suppression. */
  shouldSuppress: boolean;

  /** Combined action to apply, or `'none'` when nothing is suppressed. */
  action: SuppressionAction | 'none';

  /** Rules that matched the alert and were active during evaluation. */
  matchedRules: AlertSuppressionRule[];

  /** Optional human-readable explanation of the suppression. */
  reason?: string;
}
export default class SuppressionEngine {
/**
 * Evaluate all suppression rules for an alert.
 *
 * Rules are processed in the order returned by
 * `getEnabledRulesForProject`. A rule contributes when the alert matches
 * its criteria and the rule is currently active. The actions of all
 * contributing rules are merged so that the most restrictive one wins, and
 * evaluation stops as soon as alert creation itself is suppressed.
 *
 * The returned `reason`, and the suppression log entry, always belong to
 * the rule that set the final action. Previously the reason of the last
 * matched rule was logged against the first matched rule, so the log could
 * pair a rule with another rule's reason.
 *
 * @param alertData - Alert fields available at trigger time.
 * @param projectId - Project the alert belongs to.
 * @returns Whether to suppress, the merged action, the contributing rules,
 *          and the reason (empty string when nothing matched).
 */
public static async evaluate(
  alertData: Partial<Alert>,
  projectId: ObjectID
): Promise<SuppressionResult> {
  // Get all enabled suppression rules
  const rules = await AlertSuppressionRuleService.getEnabledRulesForProject(
    projectId
  );

  if (rules.length === 0) {
    return {
      shouldSuppress: false,
      action: 'none',
      matchedRules: [],
    };
  }

  const matchedRules: Array<AlertSuppressionRule> = [];
  let finalAction: SuppressionAction | 'none' = 'none';
  let decidingRule: AlertSuppressionRule | undefined;
  let reason = '';

  // Evaluate each rule in priority order
  for (const rule of rules) {
    // Check if alert matches rule criteria
    if (!(await this.matchesCriteria(alertData, rule.matchCriteria))) {
      continue;
    }

    // Check if rule is currently active
    if (!(await this.isRuleActive(rule, alertData, projectId))) {
      continue;
    }

    matchedRules.push(rule);

    // Determine action (most restrictive wins)
    const previousAction: SuppressionAction | 'none' = finalAction;

    if (
      rule.action === SuppressionAction.Both ||
      finalAction === SuppressionAction.Both
    ) {
      finalAction = SuppressionAction.Both;
    } else if (rule.action === SuppressionAction.SuppressCreation) {
      finalAction = SuppressionAction.SuppressCreation;
    } else if (finalAction !== SuppressionAction.SuppressCreation) {
      finalAction = SuppressionAction.SuppressNotifications;
    }

    // Only a rule that changed the outcome owns the reason. The first
    // match always does, because it moves the action away from 'none'.
    if (finalAction !== previousAction) {
      decidingRule = rule;
      reason = this.buildSuppressionReason(rule);
    }

    // If suppressing creation, no need to check more rules
    if (
      finalAction === SuppressionAction.SuppressCreation ||
      finalAction === SuppressionAction.Both
    ) {
      break;
    }
  }

  const shouldSuppress = matchedRules.length > 0;

  // Log suppression against the rule the reason actually describes
  if (decidingRule) {
    await this.logSuppression(
      alertData,
      decidingRule,
      projectId,
      reason,
      finalAction
    );
  }

  return {
    shouldSuppress,
    action: finalAction,
    matchedRules,
    reason,
  };
}
/**
 * Decide whether an alert falls within a rule's match criteria.
 *
 * Absent criteria, or criteria flagged `matchAll`, match every alert.
 * Otherwise each populated filter (severity, monitor, labels, title
 * pattern, description pattern) must accept the alert; filters that are
 * empty or unset are ignored. Patterns are applied as case-insensitive
 * regular expressions, and a pattern that fails to compile is ignored
 * rather than treated as a mismatch.
 *
 * NOTE(review): patterns look user-supplied; consider guarding against
 * catastrophic backtracking before running them on every alert.
 *
 * @param alertData - Alert fields available at trigger time.
 * @param criteria - Match criteria of the rule, if any.
 * @returns `true` when the alert satisfies every populated filter.
 */
private static async matchesCriteria(
  alertData: Partial<Alert>,
  criteria?: SuppressionMatchCriteria
): Promise<boolean> {
  if (!criteria || criteria.matchAll) {
    return true;
  }

  // True only when `pattern` compiles and does not match `text`.
  // A pattern that does not compile never rejects (check is skipped).
  const patternRejects = (pattern: string, text: string): boolean => {
    try {
      return !new RegExp(pattern, 'i').test(text);
    } catch {
      return false;
    }
  };

  // Severity filter
  const severityFilter = criteria.severityIds;
  if (severityFilter?.length) {
    const severityId = alertData.alertSeverityId?.toString();
    const severityAccepted = severityId && severityFilter.includes(severityId);
    if (!severityAccepted) {
      return false;
    }
  }

  // Monitor filter
  const monitorFilter = criteria.monitorIds;
  if (monitorFilter?.length) {
    const monitorId = alertData.monitorId?.toString();
    const monitorAccepted = monitorId && monitorFilter.includes(monitorId);
    if (!monitorAccepted) {
      return false;
    }
  }

  // Label filter: at least one of the listed labels must be on the alert
  const labelFilter = criteria.labelIds;
  if (labelFilter?.length) {
    const labelsOnAlert = new Set(
      (alertData.labels || []).map(
        (label) => label.id?.toString() || label._id?.toString()
      )
    );

    let sharesLabel = false;
    for (const wantedId of labelFilter) {
      if (labelsOnAlert.has(wantedId)) {
        sharesLabel = true;
        break;
      }
    }

    if (!sharesLabel) {
      return false;
    }
  }

  // Title pattern
  if (
    criteria.titlePattern &&
    patternRejects(criteria.titlePattern, alertData.title || '')
  ) {
    return false;
  }

  // Description pattern
  if (
    criteria.descriptionPattern &&
    patternRejects(criteria.descriptionPattern, alertData.description || '')
  ) {
    return false;
  }

  return true;
}
/**
 * Report whether a rule currently applies, dispatching on the rule type:
 * maintenance windows are checked against the clock, condition-based rules
 * against their condition, and rate-limit rules against their throttle
 * state. Any other type is treated as inactive.
 *
 * @param rule - Rule to check.
 * @param alertData - Alert fields, used by the rate-limit check.
 * @param projectId - Project the alert belongs to.
 * @returns `true` when the rule is active for this alert.
 */
private static async isRuleActive(
  rule: AlertSuppressionRule,
  alertData: Partial<Alert>,
  projectId: ObjectID
): Promise<boolean> {
  const ruleType = rule.type;

  if (ruleType === SuppressionRuleType.MaintenanceWindow) {
    return this.isMaintenanceWindowActive(rule);
  }

  if (ruleType === SuppressionRuleType.ConditionBased) {
    return await this.isConditionMet(rule, projectId);
  }

  if (ruleType === SuppressionRuleType.RateLimit) {
    return await this.isRateLimitExceeded(rule, alertData, projectId);
  }

  // Unknown or missing type
  return false;
}
/**
 * Report whether the rule's maintenance window covers the current time.
 *
 * A rule with no window configured is inactive. A window flagged recurring
 * that carries a recurrence rule is handed to `evaluateRecurrence`; any
 * other window is active from `startTime` through `endTime`, inclusive.
 *
 * @param rule - Rule whose `maintenanceWindow` is inspected.
 * @returns `true` while the window is active.
 */
private static isMaintenanceWindowActive(rule: AlertSuppressionRule): boolean {
  const config = rule.maintenanceWindow;
  if (!config) {
    return false;
  }

  const currentTime = new Date();

  const usesRecurrence = config.isRecurring && config.recurrenceRule;
  if (usesRecurrence) {
    return this.evaluateRecurrence(config, currentTime);
  }

  // One-off window: plain inclusive range check
  return currentTime >= config.startTime && currentTime <= config.endTime;
}
/**
 * Evaluate recurrence rule.
 *
 * Finds the most recent occurrence of the window's recurrence rule that
 * started at or before `now` and reports whether `now` still falls inside
 * it. Every occurrence lasts as long as the window itself
 * (`endTime - startTime`).
 *
 * The previous implementation took the first occurrence after `now - 24h`.
 * For anything recurring more often than daily that is a stale occurrence,
 * and for windows longer than 24 hours the covering occurrence lies outside
 * the look-back, so active windows were reported as inactive.
 *
 * NOTE(review): behaviour for a recurrence rule without DTSTART depends on
 * the library's default start — confirm rules are stored with DTSTART.
 *
 * @param window - Maintenance window configuration holding the recurrence rule.
 * @param now - Point in time to test.
 * @returns `true` if `now` lies inside an occurrence; `false` otherwise or
 *          when the rule cannot be parsed or evaluated.
 */
private static evaluateRecurrence(
  window: MaintenanceWindowConfig,
  now: Date
): boolean {
  try {
    const RRule = require('rrule').RRule;
    const rule = RRule.fromString(window.recurrenceRule!);

    // Latest occurrence starting at or before `now` (inclusive).
    const latestOccurrence = rule.before(now, true);

    if (!latestOccurrence) {
      // The recurrence has not started yet.
      return false;
    }

    const duration = window.endTime.getTime() - window.startTime.getTime();
    const occurrenceEnd = new Date(latestOccurrence.getTime() + duration);

    return now >= latestOccurrence && now <= occurrenceEnd;
  } catch {
    // Best effort: an unusable recurrence rule means "not active".
    return false;
  }
}
/**
 * Report whether the condition of a condition-based rule holds.
 *
 * When label ids are configured, the condition holds if the project has an
 * alert carrying any of those labels whose current state is not the
 * project's resolved state.
 *
 * @param rule - Rule whose `condition` is inspected.
 * @param projectId - Project to search for active alerts.
 * @returns `true` when the condition holds, or when the rule has no condition.
 */
private static async isConditionMet(
  rule: AlertSuppressionRule,
  projectId: ObjectID
): Promise<boolean> {
  const condition = rule.condition;

  // NOTE(review): a rule with no condition counts as "met", whereas missing
  // maintenance-window / rate-limit configs count as inactive — confirm intended.
  if (!condition) {
    return true;
  }

  // Another alert with one of the given labels is still unresolved
  const triggerLabelIds = condition.whenAlertActiveWithLabelIds;
  if (triggerLabelIds?.length) {
    const labelFilter = QueryHelper.any(triggerLabelIds);
    const resolvedStateId = await AlertStateService.getResolvedStateId(projectId);

    const unresolvedAlert = await AlertService.findOneBy({
      query: {
        projectId,
        labels: labelFilter,
        currentAlertStateId: QueryHelper.notEquals(resolvedStateId),
      },
      select: { _id: true },
      props: { isRoot: true },
    });

    if (unresolvedAlert) {
      return true;
    }
  }

  // Monitor-state condition
  if (condition.whenMonitorInStateIds?.length) {
    // Implementation depends on monitor state tracking
    // NOTE(review): not implemented yet, so this condition never holds.
  }

  return false;
}
/**
* Check if rate limit is exceeded
*/
private static async isRateLimitExceeded(
  rule: AlertSuppressionRule,
  alertData: Partial<Alert>,
  projectId: ObjectID
): Promise<boolean> {
  // Counts this alert against the rule's rate limit and reports whether the
  // limit is now exceeded. The first alert of a window opens a throttle-state
  // row; later alerts increment it.
  // NOTE(review): the find -> update sequence is not atomic, so concurrent
  // alerts may under-count - confirm whether that is acceptable.
  const limits = rule.rateLimit;
  if (!limits) {
    return false;
  }

  const key = this.computeThrottleKey(rule, alertData);

  // Look for a throttle window for this key that has not expired yet.
  const openWindow = await AlertThrottleStateService.findOneBy({
    query: {
      throttleKey: key,
      suppressionRuleId: rule.id!,
      windowExpiresAt: QueryHelper.greaterThan(new Date()),
    },
    select: {
      _id: true,
      alertCount: true,
      isThrottling: true,
    },
    props: { isRoot: true },
  });

  if (openWindow) {
    // Existing window: count this alert and compare against the limit.
    const updatedCount = (openWindow.alertCount || 0) + 1;
    const overLimit = updatedCount > limits.maxAlerts;
    await AlertThrottleStateService.updateOneById({
      id: openWindow.id!,
      data: {
        alertCount: updatedCount,
        lastAlertAt: new Date(),
        isThrottling: overLimit,
      },
      props: { isRoot: true },
    });
    return overLimit;
  }

  // No open window: start one with this alert as its first entry.
  const startedAt = new Date();
  const expiresAt = OneUptimeDate.addRemoveMinutes(
    startedAt,
    limits.timeWindowMinutes
  );
  await AlertThrottleStateService.create({
    data: {
      projectId,
      throttleKey: key,
      suppressionRuleId: rule.id!,
      alertCount: 1,
      firstAlertAt: startedAt,
      lastAlertAt: startedAt,
      windowExpiresAt: expiresAt,
      isThrottling: false,
    } as AlertThrottleState,
    props: { isRoot: true },
  });
  return false;
}
/**
* Compute throttle key from rule and alert data
*/
private static computeThrottleKey(
  rule: AlertSuppressionRule,
  alertData: Partial<Alert>
): string {
  // Builds the '|'-separated key that identifies one rate-limit bucket:
  // a rule segment followed by one segment per recognised group-by field.
  // Unrecognised group-by fields contribute nothing to the key.
  const segments: Array<string> = [`rule:${rule.id?.toString()}`];

  for (const groupField of rule.rateLimit?.groupByFields || []) {
    if (groupField === 'monitorId') {
      segments.push(`monitor:${alertData.monitorId?.toString() || 'null'}`);
    } else if (groupField === 'alertSeverityId' || groupField === 'severity') {
      // Both spellings map to the same severity segment.
      segments.push(`severity:${alertData.alertSeverityId?.toString() || 'null'}`);
    } else if (groupField === 'title') {
      segments.push(`title:${alertData.title || 'null'}`);
    }
  }

  return segments.join('|');
}
/**
* Build suppression reason string
*/
private static buildSuppressionReason(rule: AlertSuppressionRule): string {
  // Produces the human-readable reason recorded for a suppressed alert,
  // worded according to the rule type, with a generic fallback.
  const ruleType = rule.type;

  if (ruleType === SuppressionRuleType.MaintenanceWindow) {
    return `Suppressed by maintenance window: ${rule.name}`;
  }

  if (ruleType === SuppressionRuleType.ConditionBased) {
    return `Suppressed by condition: ${rule.name}`;
  }

  if (ruleType === SuppressionRuleType.RateLimit) {
    return `Suppressed by rate limit: ${rule.name} (max ${rule.rateLimit?.maxAlerts} per ${rule.rateLimit?.timeWindowMinutes} min)`;
  }

  return `Suppressed by rule: ${rule.name}`;
}
/**
* Log suppressed alert for audit trail
*/
private static async logSuppression(
  alertData: Partial<Alert>,
  rule: AlertSuppressionRule,
  projectId: ObjectID,
  reason: string,
  action: SuppressionAction | 'none'
): Promise<void> {
  // Writes one audit-log entry for the suppressed alert, then bumps the
  // rule's suppressed-alert counter.
  const entry = {
    projectId,
    suppressionRuleId: rule.id,
    alertData: alertData as object,
    alertTitle: alertData.title,
    suppressionReason: reason,
    action: action as SuppressionAction,
    suppressedAt: new Date(),
    monitorId: alertData.monitorId,
  } as SuppressedAlertLog;

  await SuppressedAlertLogService.create({
    data: entry,
    props: { isRoot: true },
  });

  // The counter is only incremented once the log entry has been written.
  await AlertSuppressionRuleService.incrementSuppressedCount(rule.id!);
}
}
// Import services at end to avoid circular dependencies
import AlertService from '../../Services/AlertService';
import AlertStateService from '../../Services/AlertStateService';
import AlertThrottleState from '../../Models/DatabaseModels/AlertThrottleState';
import SuppressedAlertLog from '../../Models/DatabaseModels/SuppressedAlertLog';
import QueryHelper from '../../Types/Database/QueryHelper';
import logger from '../../Utils/Logger';
import { MaintenanceWindowConfig } from '../../Models/DatabaseModels/AlertSuppressionRule';
```
---
### 3. Integration with AlertService
Modify `/Common/Server/Services/AlertService.ts`:
```typescript
// Add import
import SuppressionEngine from '../Utils/Alert/SuppressionEngine';
// In onBeforeCreate() method, add suppression check:
// Sketch of the suppression hook inside AlertService.onBeforeCreate. The
// surrounding existing logic (including the method's return) is elided.
protected async onBeforeCreate(
createBy: CreateBy<Alert>
): Promise<OnCreate<Alert>> {
// ... existing code ...
// Check suppression rules: evaluate the alert being created against the
// project's rules.
const suppressionResult = await SuppressionEngine.evaluate(
createBy.data,
createBy.data.projectId!
);
if (suppressionResult.shouldSuppress) {
// SuppressCreation and Both abort the create by throwing.
// NOTE(review): callers presumably need to catch SuppressedAlertException
// so a suppressed alert is not reported as a failure - confirm.
if (suppressionResult.action === SuppressionAction.SuppressCreation ||
suppressionResult.action === SuppressionAction.Both) {
// Prevent alert creation
throw new SuppressedAlertException(
suppressionResult.reason || 'Alert suppressed by rule'
);
}
// Remaining actions: the alert is still created, but it is flagged as
// notification-suppressed and tagged with the first matched rule's id.
// Mark for notification suppression
createBy.data.notificationsSuppressed = true;
createBy.data.suppressedByRuleId = suppressionResult.matchedRules[0]?.id;
}
// ... rest of existing code ...
}
```
---
### 4. SuppressedAlertLogService
**File Location:** `/Common/Server/Services/SuppressedAlertLogService.ts`
```typescript
import DatabaseService from './DatabaseService';
import SuppressedAlertLog from '../Models/DatabaseModels/SuppressedAlertLog';
import ObjectID from 'Common/Types/ObjectID';
/**
 * Database service for SuppressedAlertLog entries, with helpers for listing
 * the entries of one rule and for computing per-project statistics.
 */
export class Service extends DatabaseService<SuppressedAlertLog> {
  public constructor() {
    super(SuppressedAlertLog);
  }

  /**
   * Lists log entries written for the given rule, newest first.
   *
   * @param ruleId - suppression rule whose entries are returned
   * @param limit - maximum number of entries (defaults to 100)
   */
  public async getSuppressedByRule(
    ruleId: ObjectID,
    limit: number = 100
  ): Promise<Array<SuppressedAlertLog>> {
    const entries = await this.findBy({
      query: { suppressionRuleId: ruleId },
      select: {
        _id: true,
        alertTitle: true,
        suppressionReason: true,
        action: true,
        suppressedAt: true,
        monitorId: true,
      },
      sort: { suppressedAt: SortOrder.Descending },
      limit,
      props: { isRoot: true },
    });
    return entries;
  }

  /**
   * Computes suppression statistics for a project over a date range: the
   * total count plus counts grouped by rule and by action.
   *
   * NOTE(review): the grouping relies on `this.aggregate` accepting
   * `$match` / `$group` pipeline stages - confirm DatabaseService supports it.
   *
   * @param projectId - project to report on
   * @param startDate - inclusive start of the period
   * @param endDate - inclusive end of the period
   */
  public async getStatistics(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<{
    totalSuppressed: number;
    byRule: Array<{ ruleId: string; count: number }>;
    byAction: Array<{ action: string; count: number }>;
  }> {
    const totalSuppressed = await this.countBy({
      query: {
        projectId,
        suppressedAt: QueryHelper.between(startDate, endDate),
      },
      props: { isRoot: true },
    });

    // Both aggregations filter on the same project and period; a fresh stage
    // object is built for each pipeline.
    const periodFilter = () => ({
      $match: {
        projectId: projectId.toString(),
        suppressedAt: { $gte: startDate, $lte: endDate },
      },
    });
    const countGroupedBy = (fieldRef: string) => ({
      $group: {
        _id: fieldRef,
        count: { $sum: 1 },
      },
    });

    // Aggregate by rule
    const byRule = await this.aggregate({
      pipeline: [periodFilter(), countGroupedBy('$suppressionRuleId')],
    });

    // Aggregate by action
    const byAction = await this.aggregate({
      pipeline: [periodFilter(), countGroupedBy('$action')],
    });

    return {
      totalSuppressed,
      byRule: byRule.map((r) => ({ ruleId: r._id, count: r.count })),
      byAction: byAction.map((a) => ({ action: a._id, count: a.count })),
    };
  }
}
export default new Service();
```
---
### 5. AlertThrottleStateService
**File Location:** `/Common/Server/Services/AlertThrottleStateService.ts`
```typescript
import DatabaseService from './DatabaseService';
import AlertThrottleState from '../Models/DatabaseModels/AlertThrottleState';
/**
 * Database service for AlertThrottleState rows, which hold the per-key
 * rate-limit windows.
 */
export class Service extends DatabaseService<AlertThrottleState> {
  public constructor() {
    super(AlertThrottleState);
  }

  /**
   * Deletes every throttle state whose window expired before the current
   * time and returns whatever `deleteBy` reports for the operation.
   */
  public async cleanupExpired(): Promise<number> {
    const cutoff = new Date();
    return await this.deleteBy({
      query: {
        windowExpiresAt: QueryHelper.lessThan(cutoff),
      },
      props: { isRoot: true },
    });
  }
}
export default new Service();
```
---
## Worker Jobs
### 1. ThrottleStateCleanup Job
**File Location:** `/Worker/Jobs/AlertSuppression/ThrottleStateCleanup.ts`
```typescript
import RunCron from '../../Utils/Cron';
import { EVERY_HOUR } from 'Common/Utils/CronTime';
import AlertThrottleStateService from 'Common/Server/Services/AlertThrottleStateService';
// Hourly job: purge expired throttle states and log only when something was
// actually removed.
RunCron(
  'AlertSuppression:ThrottleStateCleanup',
  { schedule: EVERY_HOUR, runOnStartup: false },
  async () => {
    const removed = await AlertThrottleStateService.cleanupExpired();
    if (!(removed > 0)) {
      return;
    }
    logger.info(`Cleaned up ${removed} expired throttle states`);
  }
);
```
### 2. MaintenanceWindowNotification Job
**File Location:** `/Worker/Jobs/AlertSuppression/MaintenanceWindowNotification.ts`
```typescript
import RunCron from '../../Utils/Cron';
import { EVERY_MINUTE } from 'Common/Utils/CronTime';
import AlertSuppressionRuleService from 'Common/Server/Services/AlertSuppressionRuleService';
import { SuppressionRuleType } from 'Common/Models/DatabaseModels/AlertSuppressionRule';
// Every-minute job: announce maintenance windows 15 minutes before they start.
//
// Each run only notifies for windows whose start falls inside the one-minute
// slice ending 15 minutes from now. Matching the whole 15-minute look-ahead on
// every run would re-send the same notification on each of the ~15 runs that
// precede a window.
//
// NOTE(review): only the configured startTime is examined, so later
// occurrences of recurring windows are not announced - confirm whether that
// is in scope for this job.
RunCron(
  'AlertSuppression:MaintenanceWindowNotification',
  { schedule: EVERY_MINUTE, runOnStartup: false },
  async () => {
    const LEAD_TIME_MS = 15 * 60 * 1000;
    const RUN_INTERVAL_MS = 60 * 1000; // must match the EVERY_MINUTE schedule

    // Load all enabled maintenance-window rules; the time filter is applied
    // in memory below.
    const maintenanceRules = await AlertSuppressionRuleService.findBy({
      query: {
        type: SuppressionRuleType.MaintenanceWindow,
        isEnabled: true,
      },
      select: {
        _id: true,
        projectId: true,
        name: true,
        maintenanceWindow: true,
      },
      props: { isRoot: true },
    });

    const sliceEndMs = Date.now() + LEAD_TIME_MS;
    const sliceStartMs = sliceEndMs - RUN_INTERVAL_MS;

    for (const rule of maintenanceRules) {
      const window = rule.maintenanceWindow;
      if (!window) continue;

      // Normalise through Date in case the config was deserialised from JSON;
      // an unparsable start yields NaN and is skipped by the range check.
      const startsAtMs = new Date(window.startTime).getTime();
      const startsInThisSlice =
        startsAtMs > sliceStartMs && startsAtMs <= sliceEndMs;
      if (!startsInThisSlice) continue;

      // Send notification about upcoming maintenance window
      await NotificationService.sendMaintenanceWindowNotification({
        projectId: rule.projectId!,
        ruleName: rule.name!,
        startsAt: window.startTime,
        endsAt: window.endTime,
      });
    }
  }
);
```
---
## Implementation Checklist
### Phase 1: Core Services
- [ ] Create AlertSuppressionRuleService
- [ ] Create AlertSuppressionGroupService
- [ ] Create SuppressedAlertLogService
- [ ] Create AlertThrottleStateService
- [ ] Create SuppressionEngine
### Phase 2: Integration
- [ ] Modify AlertService.onBeforeCreate()
- [ ] Add SuppressedAlertException
- [ ] Add notification suppression field to Alert
### Phase 3: Worker Jobs
- [ ] Create ThrottleStateCleanup job
- [ ] Create MaintenanceWindowNotification job
- [ ] Register jobs in worker startup
### Phase 4: Testing
- [ ] Unit tests for SuppressionEngine
- [ ] Unit tests for criteria matching
- [ ] Unit tests for rate limiting
- [ ] Integration tests for full suppression flow

View File

@@ -0,0 +1,499 @@
# API Design for Alert Suppression
## Overview
This document defines the REST API endpoints for Alert Suppression functionality.
## Base URLs
```
/api/project/{projectId}/alert-suppression-rule
/api/project/{projectId}/alert-suppression-group
/api/project/{projectId}/suppressed-alert-log
/api/project/{projectId}/maintenance-window
```
---
## Suppression Rules API
### List Suppression Rules
```http
GET /api/project/{projectId}/alert-suppression-rule
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `type` | string | Filter by rule type (maintenance_window, condition_based, rate_limit) |
| `isEnabled` | boolean | Filter by enabled status |
| `suppressionGroupId` | ObjectID | Filter by suppression group |
| `limit` | number | Results per page |
| `skip` | number | Pagination offset |
**Response:**
```json
{
"data": [
{
"_id": "rule-id-1",
"name": "Nightly Maintenance Window",
"description": "Suppress alerts during nightly deployments",
"type": "maintenance_window",
"isEnabled": true,
"priority": 1,
"matchCriteria": {
"matchAll": true
},
"maintenanceWindow": {
"startTime": "2026-01-20T02:00:00Z",
"endTime": "2026-01-20T04:00:00Z",
"timezone": "America/Los_Angeles",
"isRecurring": true,
"recurrenceRule": "FREQ=DAILY"
},
"action": "both",
"suppressedCount": 156,
"lastTriggeredAt": "2026-01-20T02:15:00Z"
},
{
"_id": "rule-id-2",
"name": "Rate Limit - 10/hour per monitor",
"type": "rate_limit",
"isEnabled": true,
"priority": 2,
"matchCriteria": {},
"rateLimit": {
"maxAlerts": 10,
"timeWindowMinutes": 60,
"groupByFields": ["monitorId"]
},
"action": "suppress_creation",
"suppressedCount": 523
}
],
"count": 5,
"skip": 0,
"limit": 10
}
```
---
### Get Suppression Rule
```http
GET /api/project/{projectId}/alert-suppression-rule/{ruleId}
```
---
### Create Suppression Rule
```http
POST /api/project/{projectId}/alert-suppression-rule
```
**Request Body (Maintenance Window):**
```json
{
"name": "Weekend Maintenance",
"description": "Suppress alerts during weekend maintenance",
"type": "maintenance_window",
"isEnabled": true,
"priority": 1,
"matchCriteria": {
"labelIds": ["production-label-id"]
},
"maintenanceWindow": {
"startTime": "2026-01-25T00:00:00Z",
"endTime": "2026-01-25T06:00:00Z",
"timezone": "America/New_York",
"isRecurring": true,
"recurrenceRule": "FREQ=WEEKLY;BYDAY=SA,SU"
},
"action": "both"
}
```
**Request Body (Rate Limit):**
```json
{
"name": "Alert Storm Protection",
"description": "Limit alerts to 20 per hour per monitor",
"type": "rate_limit",
"isEnabled": true,
"priority": 10,
"matchCriteria": {
"severityIds": ["warning-id", "info-id"]
},
"rateLimit": {
"maxAlerts": 20,
"timeWindowMinutes": 60,
"groupByFields": ["monitorId"]
},
"action": "suppress_creation"
}
```
**Request Body (Condition-Based):**
```json
{
"name": "Suppress Staging Alerts",
"description": "Suppress notifications for staging environment",
"type": "condition_based",
"isEnabled": true,
"priority": 5,
"matchCriteria": {
"labelIds": ["staging-label-id"]
},
"condition": {},
"action": "suppress_notifications"
}
```
---
### Update Suppression Rule
```http
PUT /api/project/{projectId}/alert-suppression-rule/{ruleId}
```
---
### Delete Suppression Rule
```http
DELETE /api/project/{projectId}/alert-suppression-rule/{ruleId}
```
---
### Enable/Disable Rule
```http
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/enable
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/disable
```
---
### Test Suppression Rule
Test which alerts would be suppressed by a rule.
```http
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/test
```
**Request Body:**
```json
{
"alertIds": ["alert-id-1", "alert-id-2", "alert-id-3"]
}
```
**Response:**
```json
{
"results": [
{
"alertId": "alert-id-1",
"alertTitle": "MySQL connection timeout",
"wouldSuppress": true,
"action": "both",
"reason": "Matches criteria and maintenance window is active"
},
{
"alertId": "alert-id-2",
"alertTitle": "API latency high",
"wouldSuppress": false,
"reason": "Does not match severity criteria"
}
]
}
```
---
## Maintenance Windows API
Convenience endpoints specifically for maintenance windows.
### List Active Maintenance Windows
```http
GET /api/project/{projectId}/maintenance-window/active
```
**Response:**
```json
{
"data": [
{
"_id": "rule-id-1",
"name": "Nightly Maintenance",
"startedAt": "2026-01-20T02:00:00Z",
"endsAt": "2026-01-20T04:00:00Z",
"remainingMinutes": 45,
"matchCriteria": { "matchAll": true }
}
]
}
```
### List Upcoming Maintenance Windows
```http
GET /api/project/{projectId}/maintenance-window/upcoming
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `hours` | number | Look ahead hours (default: 24) |
---
### Quick Create Maintenance Window
Simplified endpoint for creating one-time maintenance windows.
```http
POST /api/project/{projectId}/maintenance-window/quick
```
**Request Body:**
```json
{
"name": "Emergency Deployment",
"durationMinutes": 60,
"matchCriteria": {
"monitorIds": ["monitor-1", "monitor-2"]
}
}
```
Creates a maintenance window starting immediately.
---
## Suppression Groups API
### List Suppression Groups
```http
GET /api/project/{projectId}/alert-suppression-group
```
### Create Suppression Group
```http
POST /api/project/{projectId}/alert-suppression-group
```
**Request Body:**
```json
{
"name": "Database Alerts",
"description": "Group for database-related suppression rules",
"throttleMinutes": 30
}
```
### Get Group with Rules
```http
GET /api/project/{projectId}/alert-suppression-group/{groupId}/rules
```
---
## Suppressed Alert Log API
### List Suppressed Alerts
```http
GET /api/project/{projectId}/suppressed-alert-log
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `suppressionRuleId` | ObjectID | Filter by rule |
| `monitorId` | ObjectID | Filter by monitor |
| `action` | string | Filter by action |
| `suppressedAt` | DateRange | Filter by date |
| `limit` | number | Results per page |
| `skip` | number | Pagination offset |
**Response:**
```json
{
"data": [
{
"_id": "log-id-1",
"alertTitle": "MySQL connection timeout",
"suppressionRule": {
"_id": "rule-id",
"name": "Nightly Maintenance"
},
"suppressionReason": "Suppressed by maintenance window: Nightly Maintenance",
"action": "both",
"suppressedAt": "2026-01-20T02:15:00Z",
"monitor": {
"_id": "monitor-id",
"name": "MySQL Production"
},
"alertData": {
"title": "MySQL connection timeout",
"description": "Connection to MySQL timed out after 30s",
"severity": "High"
}
}
],
"count": 156,
"skip": 0,
"limit": 10
}
```
### Get Suppression Statistics
```http
GET /api/project/{projectId}/suppressed-alert-log/statistics
```
**Query Parameters:**
| Parameter | Type | Description |
|-----------|------|-------------|
| `startDate` | Date | Start of period |
| `endDate` | Date | End of period |
**Response:**
```json
{
"period": {
"startDate": "2026-01-13T00:00:00Z",
"endDate": "2026-01-20T00:00:00Z"
},
"totalSuppressed": 1234,
"byRule": [
{ "ruleId": "rule-1", "ruleName": "Nightly Maintenance", "count": 523 },
{ "ruleId": "rule-2", "ruleName": "Rate Limit", "count": 711 }
],
"byAction": [
{ "action": "suppress_creation", "count": 890 },
{ "action": "suppress_notifications", "count": 244 },
{ "action": "both", "count": 100 }
],
"byDay": [
{ "date": "2026-01-13", "count": 156 },
{ "date": "2026-01-14", "count": 178 },
{ "date": "2026-01-15", "count": 145 }
]
}
```
---
## Permissions
| Endpoint | Required Permission |
|----------|---------------------|
| GET suppression rules | `ProjectMember` |
| Create/Update/Delete rules | `ProjectAdmin` |
| Enable/Disable rules | `ProjectAdmin` |
| GET suppressed logs | `ProjectMember` |
| GET statistics | `ProjectMember` |
---
## Error Responses
```json
{
"error": {
"code": "INVALID_RECURRENCE_RULE",
"message": "Invalid RRULE format: FREQ=INVALID"
}
}
```
**Error Codes:**
| Code | Description |
|------|-------------|
| `INVALID_RECURRENCE_RULE` | Invalid RRULE format |
| `INVALID_TIME_WINDOW` | End time before start time |
| `RULE_NOT_FOUND` | Suppression rule doesn't exist |
| `CANNOT_DELETE_ACTIVE_WINDOW` | Cannot delete currently active maintenance window |
| `OVERLAPPING_WINDOWS` | Maintenance windows overlap (warning only) |
---
## Webhooks
### Suppression Events
Configure webhooks to receive suppression events:
```json
{
"event": "alert.suppressed",
"timestamp": "2026-01-20T02:15:00Z",
"data": {
"projectId": "project-id",
"suppressionRuleId": "rule-id",
"suppressionRuleName": "Nightly Maintenance",
"alertTitle": "MySQL connection timeout",
"action": "both",
"reason": "Maintenance window active"
}
}
```
---
## Implementation Checklist
### Suppression Rule API
- [ ] GET /alert-suppression-rule (list)
- [ ] GET /alert-suppression-rule/:id (details)
- [ ] POST /alert-suppression-rule (create)
- [ ] PUT /alert-suppression-rule/:id (update)
- [ ] DELETE /alert-suppression-rule/:id (delete)
- [ ] POST /alert-suppression-rule/:id/enable
- [ ] POST /alert-suppression-rule/:id/disable
- [ ] POST /alert-suppression-rule/:id/test
### Maintenance Window API
- [ ] GET /maintenance-window/active
- [ ] GET /maintenance-window/upcoming
- [ ] POST /maintenance-window/quick
### Suppression Group API
- [ ] GET /alert-suppression-group (list)
- [ ] POST /alert-suppression-group (create)
- [ ] GET /alert-suppression-group/:id/rules
### Suppressed Log API
- [ ] GET /suppressed-alert-log (list)
- [ ] GET /suppressed-alert-log/statistics

View File

@@ -0,0 +1,464 @@
# UI Implementation for Alert Suppression
## Overview
This document details the frontend components and pages required for Alert Suppression functionality.
## Navigation Structure
```
Dashboard
├── Alerts
│ ├── All Alerts (existing)
│ ├── Episodes
│ └── Suppressed Alerts (NEW)
└── Settings
├── Alerts
│ ├── Alert States (existing)
│ ├── Alert Severities (existing)
│ ├── Grouping Rules
│ └── Suppression Rules (NEW)
└── Maintenance
└── Maintenance Windows (NEW)
```
---
## Pages to Create
### 1. Suppression Rules List Page
**File Location:** `/Dashboard/src/Pages/Settings/AlertSuppressionRules.tsx`
**Route:** `/dashboard/:projectId/settings/alert-suppression-rules`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Settings > Suppression Rules [+ Create Rule] │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Suppression rules prevent alert creation or notifications based on conditions. │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ [All] [Maintenance Windows] [Rate Limits] [Condition-Based] │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ ✅ Nightly Maintenance Window Priority: 1 │ │
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
│ │ 🕐 Type: Maintenance Window │ │
│ │ 📅 Schedule: Daily 2:00 AM - 4:00 AM PST │ │
│ │ 🎯 Affects: All monitors │ │
│ │ 🚫 Action: Suppress creation and notifications │ │
│ │ 📊 Suppressed: 523 alerts │ │
│ │ [Edit] [Delete]│ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ ✅ Rate Limit: 10 alerts/hour per monitor Priority: 2 │ │
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
│ │ 📈 Type: Rate Limit │ │
│ │ ⚡ Limit: 10 alerts per 60 minutes │ │
│ │ 📦 Group by: Monitor │ │
│ │ 🚫 Action: Suppress creation after threshold │ │
│ │ 📊 Suppressed: 1,247 alerts │ │
│ │ [Edit] [Delete]│ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ ❌ Staging Environment (Disabled) Priority: 3 │ │
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
│ │ 🔧 Type: Condition-Based │ │
│ │ 🎯 Matches: Labels contain "staging" │ │
│ │ 🚫 Action: Suppress notifications only │ │
│ │ [Enable] [Edit] [Delete]│ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 2. Create/Edit Suppression Rule Page
**File Location:** `/Dashboard/src/Pages/Settings/AlertSuppressionRuleView/Index.tsx`
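**Routes:** `/dashboard/:projectId/settings/alert-suppression-rules/create` (create) and `/dashboard/:projectId/settings/alert-suppression-rules/:ruleId` (edit an existing rule)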
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Create Suppression Rule │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ BASIC INFORMATION │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ Rule Name * │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Nightly Maintenance Window │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Description │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Suppress all alerts during nightly deployment window │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Rule Type * │
│ ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ │
│ │ ● Maintenance Window │ │ ○ Condition-Based │ │ ○ Rate Limit │ │
│ │ │ │ │ │ │ │
│ │ Time-based │ │ Attribute-based │ │ Threshold-based │ │
│ │ suppression │ │ suppression │ │ suppression │ │
│ └──────────────────────┘ └──────────────────────┘ └──────────────────────┘ │
│ │
│ MATCHING CRITERIA │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ Which alerts should this rule apply to? │
│ │
│ ○ All alerts │
│ ● Alerts matching specific criteria │
│ │
│ Severities (optional) │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ [Critical ×] [High ×] [+ Add] │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Monitors (optional) │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Select monitors... [Browse] │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Labels (optional) │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ [production ×] [+ Add] │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ MAINTENANCE WINDOW │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ Start Date & Time * End Date & Time * │
│ ┌────────────────────────────────┐ ┌────────────────────────────────┐ │
│ │ 2026-01-20 02:00 AM │ │ 2026-01-20 04:00 AM │ │
│ └────────────────────────────────┘ └────────────────────────────────┘ │
│ │
│ Timezone * │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ America/Los_Angeles (PST) [▼] │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ ☑ Recurring window │
│ │
│ Repeat * │
│ ┌───────────────┐ │
│ │ Daily [▼] │ │
│ └───────────────┘ │
│ │
│ ○ Never ends │
│ ● Ends on: [2026-12-31] │
│ │
│ SUPPRESSION ACTION │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ What should happen when this rule matches? * │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ ● Suppress both alert creation and notifications (Recommended) │ │
│ │ ○ Suppress alert creation only │ │
│ │ ○ Suppress notifications only (alert is still created) │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Priority (lower = evaluated first) │
│ ┌──────────┐ │
│ │ 1 │ │
│ └──────────┘ │
│ │
│ [Cancel] [Test Rule] [Save] │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 3. Rate Limit Rule Form (Conditional Section)
When "Rate Limit" is selected as rule type:
```
│ RATE LIMIT CONFIGURATION │
│ ───────────────────────────────────────────────────────────────────────────────── │
│ │
│ Maximum alerts allowed * │
│ ┌──────────┐ │
│ │ 10 │ alerts │
│ └──────────┘ │
│ │
│ Time window * │
│ ┌──────────┐ │
│ │ 60 │ minutes │
│ └──────────┘ │
│ │
│ Group rate limit by: │
│ ☑ Monitor (separate limit per monitor) │
│ ☐ Severity (separate limit per severity) │
│ ☐ None (global limit) │
│ │
│ Example: With these settings, each monitor can generate up to 10 alerts per hour. │
│ Additional alerts from the same monitor will be suppressed. │
```
---
### 4. Suppressed Alerts Log Page
**File Location:** `/Dashboard/src/Pages/Alerts/SuppressedAlerts.tsx`
**Route:** `/dashboard/:projectId/alerts/suppressed`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Alerts > Suppressed Alerts │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ 📊 Last 7 Days: 1,234 alerts suppressed (saves ~40% notification volume) │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Filters: │
│ [Rule: All ▼] [Monitor: All ▼] [Action: All ▼] [Date: Last 7 days ▼] │
│ │
│ ┌───────┬──────────────────────────────────┬───────────────┬──────────┬───────────┐│
│ │ Time │ Alert Title │ Rule │ Action │ Monitor ││
│ ├───────┼──────────────────────────────────┼───────────────┼──────────┼───────────┤│
│ │ 2:15 │ MySQL connection timeout │ Nightly Maint │ Both │ mysql-01 ││
│ │ 2:14 │ MySQL connection timeout │ Nightly Maint │ Both │ mysql-01 ││
│ │ 2:12 │ API response time > 5s │ Rate Limit │ Creation │ api-gw ││
│ │ 2:10 │ Disk space warning │ Rate Limit │ Creation │ web-03 ││
│ │ 2:08 │ Memory usage high │ Nightly Maint │ Both │ app-01 ││
│ └───────┴──────────────────────────────────┴───────────────┴──────────┴───────────┘│
│ │
│ [1] [2] [3] ... [Next →] │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 5. Maintenance Windows Calendar View
**File Location:** `/Dashboard/src/Pages/Settings/MaintenanceWindows.tsx`
**Route:** `/dashboard/:projectId/settings/maintenance-windows`
**Wireframe:**
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Settings > Maintenance Windows [+ Schedule Maintenance] │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ [Calendar View] [List View] │
│ │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ January 2026 │ │
│ │ ◀ ▶ │ │
│ │ ───────────────────────────────────────────────────────────────────────────── │ │
│ │ Sun Mon Tue Wed Thu Fri Sat │ │
│ │ ───────────────────────────────────────────────────────────────────────────── │ │
│ │ 1 2 3 4 │ │
│ │ ┌─────┐ │ │
│ │ │2-4AM│ │ │
│ │ └─────┘ │ │
│ │ 5 6 7 8 9 10 11 │ │
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │ │
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
│ │ 12 13 14 15 16 17 18 │ │
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │ │
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
│ │ 19 20 21 22 23 24 25 │ │
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────────┐ │ │
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ ││Weekend │ │ │
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ ││00:00- │ │ │
│ │ ││06:00 │ │ │
│ │ │└─────────┘ │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Legend: │
│ ┌─────┐ Nightly Maintenance (Daily 2-4 AM) │
│ └─────┘ │
│ ┌─────────┐ Weekend Deployment (Sat 00:00-06:00) │
│ └─────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 6. Active Maintenance Banner
Show a banner on the Alerts page when a maintenance window is active.
**Component:** `/Dashboard/src/Components/Alert/MaintenanceBanner.tsx`
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ 🔧 MAINTENANCE ACTIVE: "Nightly Maintenance Window" - Ends in 1h 45m │
│ Alerts matching this window will be suppressed. [View Details]│
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
### 7. Quick Maintenance Modal
Triggered from the Alerts page or a Monitor detail page.
**Component:** `/Dashboard/src/Components/Suppression/QuickMaintenanceModal.tsx`
```
┌─────────────────────────────────────────────────────────────────────────────────────┐
│ Start Maintenance Window [X] │
├─────────────────────────────────────────────────────────────────────────────────────┤
│ │
│ Quick maintenance window starting now. │
│ │
│ Name │
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
│ │ Emergency maintenance │ │
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ Duration │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ 15 min │ │ 30 min │ │ ● 1 hour │ │ 2 hours │ │
│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
│ │
│ ┌──────────────┐ │
│ │ Custom: [__] │ minutes │
│ └──────────────┘ │
│ │
│ Apply to │
│ ○ All monitors │
│ ● Selected monitors: │
│ ┌─────────────────────────────────────────────────────────────────────────────┐ │
│ │ [mysql-production ×] [api-gateway ×] [+ Add] │ │
│ └─────────────────────────────────────────────────────────────────────────────┘ │
│ │
│ [Cancel] [Start Maintenance] │
│ │
└─────────────────────────────────────────────────────────────────────────────────────┘
```
---
## Components to Create
### 1. SuppressionRuleCard
**File:** `/Dashboard/src/Components/Suppression/SuppressionRuleCard.tsx`
```typescript
/**
 * Props accepted by the SuppressionRuleCard component.
 */
interface SuppressionRuleCardProps {
  /** The suppression rule passed to the card. */
  rule: AlertSuppressionRule;

  /** Callback for the card's edit action; takes no arguments. */
  onEdit: () => void;

  /** Callback for the card's delete action; takes no arguments. */
  onDelete: () => void;

  /**
   * Callback for toggling the rule's enabled state; takes no arguments.
   * NOTE(review): presumably the parent flips the rule's enabled flag — confirm.
   */
  onToggleEnabled: () => void;
}
```
### 2. MaintenanceWindowForm
**File:** `/Dashboard/src/Components/Suppression/MaintenanceWindowForm.tsx`
Handles date/time selection, timezone, recurrence configuration.
### 3. RateLimitForm
**File:** `/Dashboard/src/Components/Suppression/RateLimitForm.tsx`
Handles max alerts, time window, group-by field selection.
### 4. MatchCriteriaBuilder
**File:** `/Dashboard/src/Components/Suppression/MatchCriteriaBuilder.tsx`
Reusable component for building match criteria (severities, monitors, labels, patterns).
### 5. SuppressionActionSelector
**File:** `/Dashboard/src/Components/Suppression/SuppressionActionSelector.tsx`
Radio group for selecting suppression action type.
### 6. MaintenanceCalendar
**File:** `/Dashboard/src/Components/Suppression/MaintenanceCalendar.tsx`
Calendar view showing maintenance windows.
### 7. SuppressedAlertsBadge
**File:** `/Dashboard/src/Components/Suppression/SuppressedAlertsBadge.tsx`
Badge showing count of suppressed alerts.
---
## Routing Configuration
Add to route configuration:
```typescript
// Suppression routes
// Each entry maps a dashboard URL pattern (scoped by `:projectId`) to a page component.
{
path: '/dashboard/:projectId/settings/alert-suppression-rules',
component: AlertSuppressionRulesPage,
},
// The static `create` segment is declared ahead of the `:ruleId` parameter route below.
// NOTE(review): keep this order if the router resolves the first matching entry — confirm.
{
path: '/dashboard/:projectId/settings/alert-suppression-rules/create',
component: CreateSuppressionRulePage,
},
// `:ruleId` identifies a single rule.
{
path: '/dashboard/:projectId/settings/alert-suppression-rules/:ruleId',
component: SuppressionRuleDetailPage,
},
{
path: '/dashboard/:projectId/settings/maintenance-windows',
component: MaintenanceWindowsPage,
},
// Unlike the routes above, this one lives under `alerts/` rather than `settings/`.
{
path: '/dashboard/:projectId/alerts/suppressed',
component: SuppressedAlertsPage,
},
```
---
## Implementation Checklist
### Pages
- [ ] Suppression rules list page
- [ ] Create/edit suppression rule page
- [ ] Suppressed alerts log page
- [ ] Maintenance windows calendar page
### Components
- [ ] SuppressionRuleCard
- [ ] MaintenanceWindowForm
- [ ] RateLimitForm
- [ ] MatchCriteriaBuilder
- [ ] SuppressionActionSelector
- [ ] MaintenanceCalendar
- [ ] QuickMaintenanceModal
- [ ] MaintenanceBanner
- [ ] SuppressedAlertsBadge
### Navigation Updates
- [ ] Add sidebar menu items
- [ ] Add route configuration
- [ ] Add navigation helpers

View File

@@ -0,0 +1,551 @@
# Migration & Rollout Plan for Alert Suppression
## Overview
This document outlines the database migrations and rollout strategy for Alert Suppression functionality.
## Database Migrations
### Migration 1: Create AlertSuppressionGroup Table
```typescript
import {
  MigrationInterface,
  QueryRunner,
  Table,
  TableColumn,
  TableForeignKey,
  TableIndex,
} from 'typeorm';
/**
 * Migration that creates the `AlertSuppressionGroup` table together with an
 * index on its `projectId` column.
 */
export class CreateAlertSuppressionGroup implements MigrationInterface {
  /** Creates the table, then the `projectId` index. */
  public async up(queryRunner: QueryRunner): Promise<void> {
    const tableName = 'AlertSuppressionGroup';

    const groupTable = new Table({
      name: tableName,
      columns: [
        { name: '_id', type: 'uuid', isPrimary: true, default: 'uuid_generate_v4()' },
        { name: 'projectId', type: 'uuid', isNullable: false },
        { name: 'name', type: 'varchar', length: '500', isNullable: false },
        { name: 'description', type: 'text', isNullable: true },
        { name: 'throttleMinutes', type: 'integer', isNullable: true },
        { name: 'throttleUntil', type: 'timestamp', isNullable: true },
        { name: 'createdAt', type: 'timestamp', default: 'CURRENT_TIMESTAMP' },
        { name: 'updatedAt', type: 'timestamp', default: 'CURRENT_TIMESTAMP' },
        { name: 'deletedAt', type: 'timestamp', isNullable: true },
      ],
    });
    // `true` is TypeORM's `ifNotExist` flag: an already-existing table is left alone.
    await queryRunner.createTable(groupTable, true);

    const projectIndex = new TableIndex({
      name: 'idx_suppression_group_project',
      columnNames: ['projectId'],
    });
    await queryRunner.createIndex(tableName, projectIndex);
  }

  /** Reverts `up` by dropping the `AlertSuppressionGroup` table. */
  public async down(queryRunner: QueryRunner): Promise<void> {
    await queryRunner.dropTable('AlertSuppressionGroup');
  }
}
```
### Migration 2: Create AlertSuppressionRule Table
```typescript
/**
 * Migration that creates the `AlertSuppressionRule` table, two lookup indexes,
 * and foreign keys to `Project` and `AlertSuppressionGroup`.
 */
export class CreateAlertSuppressionRule implements MigrationInterface {
  /** Creates the table, then its indexes, then its foreign keys (in that order). */
  public async up(queryRunner: QueryRunner): Promise<void> {
    const tableName = 'AlertSuppressionRule';

    const ruleTable = new Table({
      name: tableName,
      columns: [
        { name: '_id', type: 'uuid', isPrimary: true, default: 'uuid_generate_v4()' },
        { name: 'projectId', type: 'uuid', isNullable: false },
        { name: 'name', type: 'varchar', length: '500', isNullable: false },
        { name: 'description', type: 'text', isNullable: true },
        { name: 'type', type: 'varchar', length: '50', isNullable: false },
        { name: 'isEnabled', type: 'boolean', default: true },
        // Rule configuration payloads; all optional JSON documents.
        { name: 'matchCriteria', type: 'jsonb', isNullable: true },
        { name: 'maintenanceWindow', type: 'jsonb', isNullable: true },
        { name: 'condition', type: 'jsonb', isNullable: true },
        { name: 'rateLimit', type: 'jsonb', isNullable: true },
        // The default is a SQL string literal, hence the nested quotes.
        { name: 'action', type: 'varchar', length: '50', isNullable: false, default: "'both'" },
        { name: 'suppressionGroupId', type: 'uuid', isNullable: true },
        { name: 'priority', type: 'integer', default: 100 },
        { name: 'suppressedCount', type: 'integer', default: 0 },
        { name: 'lastTriggeredAt', type: 'timestamp', isNullable: true },
        { name: 'createdByUserId', type: 'uuid', isNullable: true },
        { name: 'createdAt', type: 'timestamp', default: 'CURRENT_TIMESTAMP' },
        { name: 'updatedAt', type: 'timestamp', default: 'CURRENT_TIMESTAMP' },
        { name: 'deletedAt', type: 'timestamp', isNullable: true },
      ],
    });
    // `true` is TypeORM's `ifNotExist` flag.
    await queryRunner.createTable(ruleTable, true);

    // Indexes
    const projectEnabledIndex = new TableIndex({
      name: 'idx_suppression_rule_project_enabled',
      columnNames: ['projectId', 'isEnabled', 'priority'],
    });
    await queryRunner.createIndex(tableName, projectEnabledIndex);

    const typeIndex = new TableIndex({
      name: 'idx_suppression_rule_type',
      columnNames: ['projectId', 'type', 'isEnabled'],
    });
    await queryRunner.createIndex(tableName, typeIndex);

    // Foreign keys
    // Deleting a project deletes its rules.
    const projectForeignKey = new TableForeignKey({
      columnNames: ['projectId'],
      referencedTableName: 'Project',
      referencedColumnNames: ['_id'],
      onDelete: 'CASCADE',
    });
    await queryRunner.createForeignKey(tableName, projectForeignKey);

    // Deleting a group detaches its rules (the column is set to NULL) instead of deleting them.
    const groupForeignKey = new TableForeignKey({
      columnNames: ['suppressionGroupId'],
      referencedTableName: 'AlertSuppressionGroup',
      referencedColumnNames: ['_id'],
      onDelete: 'SET NULL',
    });
    await queryRunner.createForeignKey(tableName, groupForeignKey);
  }

  /** Reverts `up` by dropping the `AlertSuppressionRule` table. */
  public async down(queryRunner: QueryRunner): Promise<void> {
    await queryRunner.dropTable('AlertSuppressionRule');
  }
}
```
### Migration 3: Create SuppressedAlertLog Table
```typescript
/**
 * Migration that creates the `SuppressedAlertLog` table and two indexes that
 * pair `suppressedAt` with `projectId` and with `suppressionRuleId`.
 */
export class CreateSuppressedAlertLog implements MigrationInterface {
  /** Creates the table, then both indexes. */
  public async up(queryRunner: QueryRunner): Promise<void> {
    const tableName = 'SuppressedAlertLog';

    const logTable = new Table({
      name: tableName,
      columns: [
        { name: '_id', type: 'uuid', isPrimary: true, default: 'uuid_generate_v4()' },
        { name: 'projectId', type: 'uuid', isNullable: false },
        { name: 'suppressionRuleId', type: 'uuid', isNullable: true },
        { name: 'alertData', type: 'jsonb', isNullable: false },
        { name: 'alertTitle', type: 'text', isNullable: true },
        { name: 'suppressionReason', type: 'text', isNullable: false },
        { name: 'action', type: 'varchar', length: '50', isNullable: false },
        { name: 'suppressedAt', type: 'timestamp', isNullable: false },
        { name: 'monitorId', type: 'uuid', isNullable: true },
        { name: 'createdAt', type: 'timestamp', default: 'CURRENT_TIMESTAMP' },
        { name: 'deletedAt', type: 'timestamp', isNullable: true },
      ],
    });
    // `true` is TypeORM's `ifNotExist` flag.
    await queryRunner.createTable(logTable, true);

    // Indexes
    const projectDateIndex = new TableIndex({
      name: 'idx_suppressed_log_project_date',
      columnNames: ['projectId', 'suppressedAt'],
    });
    await queryRunner.createIndex(tableName, projectDateIndex);

    const ruleDateIndex = new TableIndex({
      name: 'idx_suppressed_log_rule',
      columnNames: ['suppressionRuleId', 'suppressedAt'],
    });
    await queryRunner.createIndex(tableName, ruleDateIndex);
  }

  /** Reverts `up` by dropping the `SuppressedAlertLog` table. */
  public async down(queryRunner: QueryRunner): Promise<void> {
    await queryRunner.dropTable('SuppressedAlertLog');
  }
}
```
### Migration 4: Create AlertThrottleState Table
```typescript
/**
 * Migration that creates the `AlertThrottleState` table, an index on
 * (`throttleKey`, `windowExpiresAt`), and a unique index on
 * (`throttleKey`, `suppressionRuleId`).
 */
export class CreateAlertThrottleState implements MigrationInterface {
  /** Creates the table, then both indexes. */
  public async up(queryRunner: QueryRunner): Promise<void> {
    const tableName = 'AlertThrottleState';

    const throttleTable = new Table({
      name: tableName,
      columns: [
        { name: '_id', type: 'uuid', isPrimary: true, default: 'uuid_generate_v4()' },
        { name: 'projectId', type: 'uuid', isNullable: false },
        { name: 'throttleKey', type: 'varchar', length: '500', isNullable: false },
        { name: 'suppressionRuleId', type: 'uuid', isNullable: false },
        { name: 'alertCount', type: 'integer', default: 0 },
        { name: 'firstAlertAt', type: 'timestamp', isNullable: false },
        { name: 'lastAlertAt', type: 'timestamp', isNullable: false },
        { name: 'windowExpiresAt', type: 'timestamp', isNullable: false },
        { name: 'isThrottling', type: 'boolean', default: false },
        { name: 'createdAt', type: 'timestamp', default: 'CURRENT_TIMESTAMP' },
        { name: 'updatedAt', type: 'timestamp', default: 'CURRENT_TIMESTAMP' },
      ],
    });
    // `true` is TypeORM's `ifNotExist` flag.
    await queryRunner.createTable(throttleTable, true);

    // Indexes
    const keyExpiryIndex = new TableIndex({
      name: 'idx_throttle_state_key',
      columnNames: ['throttleKey', 'windowExpiresAt'],
    });
    await queryRunner.createIndex(tableName, keyExpiryIndex);

    // At most one state row per (throttleKey, suppressionRuleId) pair.
    const uniqueKeyRuleIndex = new TableIndex({
      name: 'idx_throttle_state_unique',
      columnNames: ['throttleKey', 'suppressionRuleId'],
      isUnique: true,
    });
    await queryRunner.createIndex(tableName, uniqueKeyRuleIndex);
  }

  /** Reverts `up` by dropping the `AlertThrottleState` table. */
  public async down(queryRunner: QueryRunner): Promise<void> {
    await queryRunner.dropTable('AlertThrottleState');
  }
}
```
### Migration 5: Add Suppression Fields to Alert Table
```typescript
/**
 * Migration that adds the `notificationsSuppressed` and `suppressedByRuleId`
 * columns to the existing `Alert` table.
 */
export class AddSuppressionFieldsToAlert implements MigrationInterface {
  /** Adds `notificationsSuppressed` first, then `suppressedByRuleId`. */
  public async up(queryRunner: QueryRunner): Promise<void> {
    const alertTable = 'Alert';

    const notificationsSuppressedColumn = new TableColumn({
      name: 'notificationsSuppressed',
      type: 'boolean',
      default: false,
    });
    await queryRunner.addColumn(alertTable, notificationsSuppressedColumn);

    const suppressedByRuleIdColumn = new TableColumn({
      name: 'suppressedByRuleId',
      type: 'uuid',
      isNullable: true,
    });
    await queryRunner.addColumn(alertTable, suppressedByRuleIdColumn);
  }

  /** Drops the two columns in the reverse of the order `up` added them. */
  public async down(queryRunner: QueryRunner): Promise<void> {
    const columnsInDropOrder = ['suppressedByRuleId', 'notificationsSuppressed'];
    for (const columnName of columnsInDropOrder) {
      await queryRunner.dropColumn('Alert', columnName);
    }
  }
}
```
---
## Rollout Strategy
### Phase 1: Internal Testing
**Duration:** 1 week
- Deploy to staging environment
- Create test suppression rules
- Verify suppression logic works correctly
- Test all three rule types
### Phase 2: Beta (Opt-in)
**Duration:** 2 weeks
- Enable feature flag for early adopters
- Collect feedback on UI/UX
- Monitor for performance issues
- Document common use cases
### Phase 3: General Availability
**Duration:** Ongoing
- Enable for all projects
- No suppression rules are active by default
- Users opt-in by creating rules
---
## Data Retention
### SuppressedAlertLog Retention
Suppressed alert logs should be retained for compliance, but cleaned up once the retention period has elapsed:
```typescript
// Worker job that removes SuppressedAlertLog rows older than the retention period.
// Registered on the EVERY_DAY schedule; it does not run on startup.
RunCron(
  'SuppressedAlertLog:Cleanup',
  { schedule: EVERY_DAY, runOnStartup: false },
  async () => {
    // Retention period in days. Hard-coded in this snippet; meant to be configurable per project.
    const retentionDays = 90;

    // A negative day count moves the date backwards, giving "now minus retentionDays".
    const now = OneUptimeDate.getCurrentDate();
    const cutoffDate = OneUptimeDate.addRemoveDays(now, -retentionDays);

    // Delete every log whose suppressedAt is earlier than the cutoff. The query has no
    // projectId filter, so it is not limited to a single project.
    // NOTE(review): `isRoot: true` presumably bypasses permission checks — confirm.
    // NOTE(review): check whether deleteBy also expects `limit`/`skip` arguments — confirm.
    const expiredLogsQuery = {
      suppressedAt: QueryHelper.lessThan(cutoffDate),
    };
    await SuppressedAlertLogService.deleteBy({
      query: expiredLogsQuery,
      props: { isRoot: true },
    });
  }
);
```
---
## Implementation Checklist
### Pre-Migration
- [ ] Review migration scripts
- [ ] Test on staging
- [ ] Backup production database
### Migration
- [ ] Run migrations in order
- [ ] Verify table creation
- [ ] Verify indexes
### Post-Migration
- [ ] Deploy API changes
- [ ] Deploy Dashboard changes
- [ ] Deploy Worker jobs
- [ ] Enable feature flags
### Monitoring
- [ ] Set up suppression metrics
- [ ] Alert on engine errors
- [ ] Monitor performance

View File

@@ -0,0 +1,165 @@
# Alert Suppression Implementation Plan
## Overview
This sub-plan details the implementation of Alert Suppression functionality for OneUptime. This feature allows users to suppress alert creation and/or notifications based on configurable rules including maintenance windows, conditions, and rate limits.
## Documents
| Document | Description |
|----------|-------------|
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
| [2-Backend.md](./2-Backend.md) | Backend services and suppression engine |
| [3-API.md](./3-API.md) | REST API endpoints |
| [4-UI.md](./4-UI.md) | Frontend components and pages |
| [5-Migration.md](./5-Migration.md) | Database migrations and rollout |
## Feature Summary
### What is Alert Suppression?
Alert Suppression allows you to temporarily or permanently prevent alerts from being created or notifications from being sent based on configurable rules.
### Suppression Types
| Type | Description | Use Case |
|------|-------------|----------|
| **Maintenance Window** | Time-based suppression | Planned deployments, scheduled maintenance |
| **Condition-Based** | Suppress based on alert attributes | Ignore staging alerts, low-priority monitors |
| **Rate Limit** | Suppress after threshold exceeded | Prevent alert storms, noise reduction |
### Key Capabilities
1. **Maintenance Windows** - Schedule suppression periods (one-time or recurring)
2. **Condition Matching** - Suppress alerts matching specific criteria
3. **Rate Limiting** - Limit alerts per time window per dimension
4. **Suppression Actions** - Choose to suppress creation, notifications, or both
5. **Audit Trail** - Track all suppressed alerts for compliance
6. **Suppression Groups** - Group related rules for coordinated suppression
### User Stories
```
As an operator, I want to create a maintenance window
so that I don't get alerted during planned deployments.
As a team lead, I want to suppress notifications for staging alerts
so that my team only gets paged for production issues.
As an SRE, I want to rate-limit alerts per monitor
so that a single flapping service doesn't flood my inbox.
As a compliance officer, I want to see which alerts were suppressed
so that I can audit our alert handling procedures.
```
## Implementation Phases
### Phase 1: Data Models & Core Engine (Week 1-2)
- [ ] Create AlertSuppressionRule model
- [ ] Create AlertSuppressionGroup model
- [ ] Create SuppressedAlertLog model
- [ ] Implement SuppressionEngine
- [ ] Integrate with AlertService
### Phase 2: Maintenance Windows (Week 3)
- [ ] Time-based suppression logic
- [ ] Recurring schedule support (RRULE)
- [ ] Timezone handling
- [ ] Calendar UI component
### Phase 3: Condition & Rate Limiting (Week 4)
- [ ] Condition-based matching
- [ ] Rate limit state tracking
- [ ] AlertThrottleState model
- [ ] Per-field rate limiting
### Phase 4: UI Implementation (Week 5-6)
- [ ] Suppression rules list page
- [ ] Create/edit rule forms
- [ ] Maintenance window calendar
- [ ] Suppressed alerts log view
### Phase 5: Analytics & Reporting (Week 7)
- [ ] Suppression metrics dashboard
- [ ] Noise reduction statistics
- [ ] Audit log export
## Dependencies
### Existing Components Used
- `Alert` model and `AlertService`
- `AlertSeverity` and `AlertState` models
- `Monitor` and `Label` models
- Dashboard ModelTable and ModelForm components
- Notification system
### New Components Created
- `AlertSuppressionRule` model
- `AlertSuppressionGroup` model
- `SuppressedAlertLog` model
- `AlertThrottleState` model
- `SuppressionEngine` service
- Suppression UI pages
## Success Metrics
| Metric | Target |
|--------|--------|
| Suppression rule creation | < 5 minutes |
| Rule evaluation latency | < 10ms |
| Maintenance window accuracy | 100% (no alerts during window) |
| User adoption | 60% of projects with rules |
## Architecture Diagram
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ Alert Creation Flow │
└─────────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────┐
│ Alert Trigger │
│ (Monitor/Manual) │
└──────────┬───────────┘
┌──────────────────────┐
│ SuppressionEngine │
│ .evaluate() │
└──────────┬───────────┘
┌────────────────┼────────────────┐
│ │ │
▼ ▼ ▼
┌─────────────────┐ ┌─────────────┐ ┌─────────────────┐
│ Maintenance │ │ Condition │ │ Rate Limit │
│ Window Check │ │ Check │ │ Check │
└────────┬────────┘ └──────┬──────┘ └────────┬────────┘
│ │ │
└────────────────┼────────────────┘
┌─────────┴─────────┐
│ │
▼ ▼
┌─────────────────┐ ┌─────────────────┐
│ SUPPRESS │ │ ALLOW │
│ - Log to audit │ │ - Create alert │
│ - Skip creation │ │ - Send notifs │
│ or notifs │ │ │
└─────────────────┘ └─────────────────┘
```
## References
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
- [Alert Grouping Plan](../AlertGrouping/README.md)
- [PagerDuty Maintenance Windows](https://support.pagerduty.com/docs/maintenance-windows)
- [Splunk Alert Suppression](https://docs.splunk.com/Documentation/ITSI)

View File

@@ -4,6 +4,17 @@
set -e
# Helper function to unwrap API values that might be in wrapper format
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
#
# Arguments:
#   $1 - JSON text of a single value (for example the output of `jq '.name'`).
# Outputs:
#   The raw (jq -r) value on stdout. An object that has a "value" key is
#   replaced by that key's content; anything else is printed as-is. JSON null
#   (bare or wrapped) prints as an empty string, not the literal "null".
unwrap_value() {
    local raw_value="$1"
    # Check for the key with has() rather than truthiness (jq -e), so a wrapped
    # false/null is still unwrapped instead of echoing the whole wrapper object.
    printf '%s\n' "$raw_value" | jq -r '
        (if type == "object" then (if has("value") then .value else . end) else . end)
        | if . == null then "" else . end'
}
# Get terraform outputs
LABEL_ID=$(terraform output -raw label_id)
LABEL_NAME=$(terraform output -raw label_name)
@@ -29,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Label exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$LABEL_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$LABEL_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$LABEL_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$LABEL_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$LABEL_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$LABEL_COLOR', Got: '$API_COLOR'"
exit 1

View File

@@ -4,6 +4,17 @@
set -e
# Helper function to unwrap API values that might be in wrapper format
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
#
# Arguments:
#   $1 - JSON text of a single value (for example the output of `jq '.name'`).
# Outputs:
#   The raw (jq -r) value on stdout. An object that has a "value" key is
#   replaced by that key's content; anything else is printed as-is. JSON null
#   (bare or wrapped) prints as an empty string, not the literal "null".
unwrap_value() {
    local raw_value="$1"
    # Check for the key with has() rather than truthiness (jq -e), so a wrapped
    # false/null is still unwrapped instead of echoing the whole wrapper object.
    printf '%s\n' "$raw_value" | jq -r '
        (if type == "object" then (if has("value") then .value else . end) else . end)
        | if . == null then "" else . end'
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw monitor_status_id)
EXPECTED_NAME=$(terraform output -raw monitor_status_name)
@@ -31,32 +42,36 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Monitor status exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1
fi
echo " ✓ Color matches: $API_COLOR"
# Validate priority
API_PRIORITY=$(echo "$RESPONSE" | jq -r '.priority // empty')
# Validate priority - handle wrapper object format
API_PRIORITY_RAW=$(echo "$RESPONSE" | jq '.priority')
API_PRIORITY=$(unwrap_value "$API_PRIORITY_RAW")
if [ "$API_PRIORITY" != "$EXPECTED_PRIORITY" ]; then
echo " ✗ FAILED: Priority mismatch - Expected: '$EXPECTED_PRIORITY', Got: '$API_PRIORITY'"
exit 1

View File

@@ -4,6 +4,17 @@
set -e
# Helper function to unwrap API values that might be in wrapper format
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
#
# Arguments:
#   $1 - JSON text of a single value (for example the output of `jq '.name'`).
# Outputs:
#   The raw (jq -r) value on stdout. An object that has a "value" key is
#   replaced by that key's content; anything else is printed as-is. JSON null
#   (bare or wrapped) prints as an empty string, not the literal "null".
unwrap_value() {
    local raw_value="$1"
    # Check for the key with has() rather than truthiness (jq -e), so a wrapped
    # false/null is still unwrapped instead of echoing the whole wrapper object.
    printf '%s\n' "$raw_value" | jq -r '
        (if type == "object" then (if has("value") then .value else . end) else . end)
        | if . == null then "" else . end'
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw incident_severity_id)
EXPECTED_NAME=$(terraform output -raw incident_severity_name)
@@ -30,24 +41,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Incident severity exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1

View File

@@ -4,6 +4,17 @@
set -e
# Helper function to unwrap API values that might be in wrapper format
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
#
# Arguments:
#   $1 - JSON text of a single value (for example the output of `jq '.name'`).
# Outputs:
#   The raw (jq -r) value on stdout. An object that has a "value" key is
#   replaced by that key's content; anything else is printed as-is. JSON null
#   (bare or wrapped) prints as an empty string, not the literal "null".
unwrap_value() {
    local raw_value="$1"
    # Check for the key with has() rather than truthiness (jq -e), so a wrapped
    # false/null is still unwrapped instead of echoing the whole wrapper object.
    printf '%s\n' "$raw_value" | jq -r '
        (if type == "object" then (if has("value") then .value else . end) else . end)
        | if . == null then "" else . end'
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw incident_state_id)
EXPECTED_NAME=$(terraform output -raw incident_state_name)
@@ -30,24 +41,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Incident state exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1

View File

@@ -4,6 +4,17 @@
set -e
# Helper function to unwrap API values that might be in wrapper format
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
#
# Arguments:
#   $1 - JSON text of a single value (for example the output of `jq '.name'`).
# Outputs:
#   The raw (jq -r) value on stdout. An object that has a "value" key is
#   replaced by that key's content; anything else is printed as-is. JSON null
#   (bare or wrapped) prints as an empty string, not the literal "null".
unwrap_value() {
    local raw_value="$1"
    # Check for the key with has() rather than truthiness (jq -e), so a wrapped
    # false/null is still unwrapped instead of echoing the whole wrapper object.
    printf '%s\n' "$raw_value" | jq -r '
        (if type == "object" then (if has("value") then .value else . end) else . end)
        | if . == null then "" else . end'
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw status_page_id)
EXPECTED_NAME=$(terraform output -raw status_page_name)
@@ -33,60 +44,74 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Status page exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate pageTitle
API_PAGE_TITLE=$(echo "$RESPONSE" | jq -r '.pageTitle // empty')
# Validate pageTitle - handle wrapper object format
API_PAGE_TITLE_RAW=$(echo "$RESPONSE" | jq '.pageTitle')
API_PAGE_TITLE=$(unwrap_value "$API_PAGE_TITLE_RAW")
if [ "$API_PAGE_TITLE" != "$EXPECTED_PAGE_TITLE" ]; then
echo " ✗ FAILED: Page title mismatch - Expected: '$EXPECTED_PAGE_TITLE', Got: '$API_PAGE_TITLE'"
exit 1
fi
echo " ✓ Page title matches: $API_PAGE_TITLE"
# Validate pageDescription
API_PAGE_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.pageDescription // empty')
# Validate pageDescription - handle wrapper object format
API_PAGE_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.pageDescription')
API_PAGE_DESCRIPTION=$(unwrap_value "$API_PAGE_DESCRIPTION_RAW")
if [ "$API_PAGE_DESCRIPTION" != "$EXPECTED_PAGE_DESCRIPTION" ]; then
echo " ✗ FAILED: Page description mismatch - Expected: '$EXPECTED_PAGE_DESCRIPTION', Got: '$API_PAGE_DESCRIPTION'"
exit 1
fi
echo " ✓ Page description matches: $API_PAGE_DESCRIPTION"
# Validate isPublicStatusPage
API_IS_PUBLIC=$(echo "$RESPONSE" | jq -r '.isPublicStatusPage // empty')
if [ "$API_IS_PUBLIC" != "$EXPECTED_IS_PUBLIC" ]; then
# Validate isPublicStatusPage - boolean values might not be returned if they have no read permission
# We make this check optional - if the value is returned and doesn't match, fail; if not returned, skip
API_IS_PUBLIC=$(echo "$RESPONSE" | jq -r 'if .isPublicStatusPage == null then "skip" elif .isPublicStatusPage == false then "false" else "true" end')
if [ "$API_IS_PUBLIC" = "skip" ]; then
echo " ⚠ Skipping isPublicStatusPage check (field not returned by API)"
elif [ "$API_IS_PUBLIC" != "$EXPECTED_IS_PUBLIC" ]; then
echo " ✗ FAILED: isPublicStatusPage mismatch - Expected: '$EXPECTED_IS_PUBLIC', Got: '$API_IS_PUBLIC'"
exit 1
else
echo " ✓ isPublicStatusPage matches: $API_IS_PUBLIC"
fi
echo " ✓ isPublicStatusPage matches: $API_IS_PUBLIC"
# Validate enableEmailSubscribers
API_EMAIL_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r '.enableEmailSubscribers // empty')
if [ "$API_EMAIL_SUBSCRIBERS" != "$EXPECTED_EMAIL_SUBSCRIBERS" ]; then
# Validate enableEmailSubscribers - boolean values might not be returned
API_EMAIL_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r 'if .enableEmailSubscribers == null then "skip" elif .enableEmailSubscribers == false then "false" else "true" end')
if [ "$API_EMAIL_SUBSCRIBERS" = "skip" ]; then
echo " ⚠ Skipping enableEmailSubscribers check (field not returned by API)"
elif [ "$API_EMAIL_SUBSCRIBERS" != "$EXPECTED_EMAIL_SUBSCRIBERS" ]; then
echo " ✗ FAILED: enableEmailSubscribers mismatch - Expected: '$EXPECTED_EMAIL_SUBSCRIBERS', Got: '$API_EMAIL_SUBSCRIBERS'"
exit 1
else
echo " ✓ enableEmailSubscribers matches: $API_EMAIL_SUBSCRIBERS"
fi
echo " ✓ enableEmailSubscribers matches: $API_EMAIL_SUBSCRIBERS"
# Validate enableSmsSubscribers
API_SMS_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r '.enableSmsSubscribers // empty')
if [ "$API_SMS_SUBSCRIBERS" != "$EXPECTED_SMS_SUBSCRIBERS" ]; then
# Validate enableSmsSubscribers - boolean values might not be returned
API_SMS_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r 'if .enableSmsSubscribers == null then "skip" elif .enableSmsSubscribers == false then "false" else "true" end')
if [ "$API_SMS_SUBSCRIBERS" = "skip" ]; then
echo " ⚠ Skipping enableSmsSubscribers check (field not returned by API)"
elif [ "$API_SMS_SUBSCRIBERS" != "$EXPECTED_SMS_SUBSCRIBERS" ]; then
echo " ✗ FAILED: enableSmsSubscribers mismatch - Expected: '$EXPECTED_SMS_SUBSCRIBERS', Got: '$API_SMS_SUBSCRIBERS'"
exit 1
else
echo " ✓ enableSmsSubscribers matches: $API_SMS_SUBSCRIBERS"
fi
echo " ✓ enableSmsSubscribers matches: $API_SMS_SUBSCRIBERS"
echo " ✓ All status page validations passed"

View File

@@ -4,6 +4,16 @@
set -e
# Helper function to unwrap API values that might be in wrapper format
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
#
# Arguments:
#   $1 - JSON text of a single value (for example the output of `jq '.name'`).
# Outputs:
#   The raw (jq -r) value on stdout. An object that has a "value" key is
#   replaced by that key's content; anything else is printed as-is. JSON null
#   (bare or wrapped) prints as an empty string, not the literal "null".
unwrap_value() {
    local raw_value="$1"
    # Check for the key with has() rather than truthiness (jq -e), so a wrapped
    # false/null is still unwrapped instead of echoing the whole wrapper object.
    printf '%s\n' "$raw_value" | jq -r '
        (if type == "object" then (if has("value") then .value else . end) else . end)
        | if . == null then "" else . end'
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw alert_severity_id)
EXPECTED_NAME=$(terraform output -raw alert_severity_name)
@@ -30,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Alert severity exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1

View File

@@ -4,6 +4,16 @@
set -e
# Helper function to unwrap API values that might be in wrapper format
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
#
# Arguments:
#   $1 - JSON text of a single value (for example the output of `jq '.name'`).
# Outputs:
#   The raw (jq -r) value on stdout. An object that has a "value" key is
#   replaced by that key's content; anything else is printed as-is. JSON null
#   (bare or wrapped) prints as an empty string, not the literal "null".
unwrap_value() {
    local raw_value="$1"
    # Check for the key with has() rather than truthiness (jq -e), so a wrapped
    # false/null is still unwrapped instead of echoing the whole wrapper object.
    printf '%s\n' "$raw_value" | jq -r '
        (if type == "object" then (if has("value") then .value else . end) else . end)
        | if . == null then "" else . end'
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw alert_state_id)
EXPECTED_NAME=$(terraform output -raw alert_state_name)
@@ -30,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Alert state exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw label_id)
EXPECTED_NAME=$(terraform output -raw label_name)
@@ -29,24 +39,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Label exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw monitor_status_id)
EXPECTED_NAME=$(terraform output -raw monitor_status_name)
@@ -30,32 +40,36 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Monitor status exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1
fi
echo " ✓ Color matches: $API_COLOR"
# Validate priority
API_PRIORITY=$(echo "$RESPONSE" | jq -r '.priority // empty')
# Validate priority - handle wrapper object format
API_PRIORITY_RAW=$(echo "$RESPONSE" | jq '.priority')
API_PRIORITY=$(unwrap_value "$API_PRIORITY_RAW")
if [ "$API_PRIORITY" != "$EXPECTED_PRIORITY" ]; then
echo " ✗ FAILED: Priority mismatch - Expected: '$EXPECTED_PRIORITY', Got: '$API_PRIORITY'"
exit 1

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw incident_severity_id)
EXPECTED_NAME=$(terraform output -raw incident_severity_name)
@@ -30,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Incident severity exists in API"
# Validate name
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
# Validate name - handle wrapper object format
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
exit 1
fi
echo " ✓ Name matches: $API_NAME"
# Validate description
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
# Validate description - handle wrapper object format
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
exit 1
fi
echo " ✓ Description matches: $API_DESCRIPTION"
# Validate color
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
# Validate color - handle wrapper object format
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
exit 1

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
DOMAIN_ID=$(terraform output -raw domain_id)
STATUS_PAGE_ID=$(terraform output -raw status_page_id)
@@ -33,7 +43,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Domain exists in API"
API_DOMAIN=$(echo "$RESPONSE" | jq -r '.domain // empty')
API_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.domain')
API_DOMAIN=$(unwrap_value "$API_DOMAIN_RAW")
if [ "$API_DOMAIN" != "$EXPECTED_DOMAIN_NAME" ]; then
echo " ✗ FAILED: Domain name mismatch - Expected: '$EXPECTED_DOMAIN_NAME', Got: '$API_DOMAIN'"
exit 1
@@ -66,7 +77,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Status page exists in API"
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_SP_NAME" ]; then
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_SP_NAME', Got: '$API_NAME'"
exit 1
@@ -92,14 +104,16 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Status page domain exists in API"
API_SUBDOMAIN=$(echo "$RESPONSE" | jq -r '.subdomain // empty')
API_SUBDOMAIN_RAW=$(echo "$RESPONSE" | jq '.subdomain')
API_SUBDOMAIN=$(unwrap_value "$API_SUBDOMAIN_RAW")
if [ "$API_SUBDOMAIN" != "$EXPECTED_SUBDOMAIN" ]; then
echo " ✗ FAILED: Subdomain mismatch - Expected: '$EXPECTED_SUBDOMAIN', Got: '$API_SUBDOMAIN'"
exit 1
fi
echo " ✓ Subdomain matches: $API_SUBDOMAIN"
API_FULL_DOMAIN=$(echo "$RESPONSE" | jq -r '.fullDomain // empty')
API_FULL_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.fullDomain')
API_FULL_DOMAIN=$(unwrap_value "$API_FULL_DOMAIN_RAW")
if [ "$API_FULL_DOMAIN" != "$EXPECTED_FULL_DOMAIN" ]; then
echo " ✗ FAILED: Full domain mismatch - Expected: '$EXPECTED_FULL_DOMAIN', Got: '$API_FULL_DOMAIN'"
exit 1

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
DOMAIN_ID=$(terraform output -raw domain_id)
STATUS_PAGE_ID=$(terraform output -raw status_page_id)
@@ -34,7 +44,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Domain exists in API"
API_DOMAIN=$(echo "$RESPONSE" | jq -r '.domain // empty')
API_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.domain')
API_DOMAIN=$(unwrap_value "$API_DOMAIN_RAW")
if [ "$API_DOMAIN" != "$EXPECTED_DOMAIN_NAME" ]; then
echo " ✗ FAILED: Domain name mismatch - Expected: '$EXPECTED_DOMAIN_NAME', Got: '$API_DOMAIN'"
exit 1
@@ -64,11 +75,12 @@ echo ""
echo " Verifying status page domain computed fields (Issue #2236)..."
echo " Status Page Domain ID: $STATUS_PAGE_DOMAIN_ID"
# Note: cnameVerificationToken has no read permission, so we don't include it in the select
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/status-page-domain/${STATUS_PAGE_DOMAIN_ID}/get-item" \
-H "Content-Type: application/json" \
-H "Apikey: $TF_VAR_api_key" \
-H "projectid: $TF_VAR_project_id" \
-d '{"select": {"_id": true, "subdomain": true, "fullDomain": true, "cnameVerificationToken": true}}')
-d '{"select": {"_id": true, "subdomain": true, "fullDomain": true}}')
API_ID=$(echo "$RESPONSE" | jq -r '._id // empty')
if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
@@ -79,7 +91,8 @@ fi
echo " ✓ Status page domain exists in API"
# Validate subdomain
API_SUBDOMAIN=$(echo "$RESPONSE" | jq -r '.subdomain // empty')
API_SUBDOMAIN_RAW=$(echo "$RESPONSE" | jq '.subdomain')
API_SUBDOMAIN=$(unwrap_value "$API_SUBDOMAIN_RAW")
if [ "$API_SUBDOMAIN" != "$EXPECTED_SUBDOMAIN" ]; then
echo " ✗ FAILED: Subdomain mismatch - Expected: '$EXPECTED_SUBDOMAIN', Got: '$API_SUBDOMAIN'"
exit 1
@@ -87,7 +100,8 @@ fi
echo " ✓ Subdomain matches: $API_SUBDOMAIN"
# Validate computed full_domain (Issue #2236 key validation)
API_FULL_DOMAIN=$(echo "$RESPONSE" | jq -r '.fullDomain // empty')
API_FULL_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.fullDomain')
API_FULL_DOMAIN=$(unwrap_value "$API_FULL_DOMAIN_RAW")
if [ -z "$API_FULL_DOMAIN" ] || [ "$API_FULL_DOMAIN" = "null" ]; then
echo " ✗ FAILED: fullDomain is empty - server should compute this value"
exit 1

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
INCIDENT_ID=$(terraform output -raw incident_id)
INCIDENT_SEVERITY_ID=$(terraform output -raw incident_severity_id)
@@ -30,7 +40,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Incident severity exists in API"
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_SEVERITY_NAME" ]; then
echo " ✗ FAILED: Severity name mismatch - Expected: '$EXPECTED_SEVERITY_NAME', Got: '$API_NAME'"
exit 1
@@ -56,24 +67,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Incident exists in API"
# Validate title
API_TITLE=$(echo "$RESPONSE" | jq -r '.title // empty')
# Validate title - handle wrapper object format
API_TITLE_RAW=$(echo "$RESPONSE" | jq '.title')
API_TITLE=$(unwrap_value "$API_TITLE_RAW")
if [ "$API_TITLE" != "$EXPECTED_TITLE" ]; then
echo " ✗ FAILED: Title mismatch - Expected: '$EXPECTED_TITLE', Got: '$API_TITLE'"
exit 1
fi
echo " ✓ Title matches: $API_TITLE"
# Validate incident severity relationship
API_SEVERITY_ID=$(echo "$RESPONSE" | jq -r '.incidentSeverityId // empty')
# Validate incident severity relationship - handle wrapper object format (ObjectID)
API_SEVERITY_ID_RAW=$(echo "$RESPONSE" | jq '.incidentSeverityId')
API_SEVERITY_ID=$(unwrap_value "$API_SEVERITY_ID_RAW")
if [ "$API_SEVERITY_ID" != "$INCIDENT_SEVERITY_ID" ]; then
echo " ✗ FAILED: Incident severity ID mismatch - Expected: '$INCIDENT_SEVERITY_ID', Got: '$API_SEVERITY_ID'"
exit 1
fi
echo " ✓ Incident severity ID matches"
# Validate server-provided currentIncidentStateId
CURRENT_STATE_ID=$(echo "$RESPONSE" | jq -r '.currentIncidentStateId // empty')
# Validate server-provided currentIncidentStateId - handle wrapper object format (ObjectID)
CURRENT_STATE_ID_RAW=$(echo "$RESPONSE" | jq '.currentIncidentStateId')
CURRENT_STATE_ID=$(unwrap_value "$CURRENT_STATE_ID_RAW")
if [ -n "$CURRENT_STATE_ID" ] && [ "$CURRENT_STATE_ID" != "null" ]; then
echo " ✓ Server-assigned currentIncidentStateId: $CURRENT_STATE_ID"
fi

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
ALERT_ID=$(terraform output -raw alert_id)
ALERT_SEVERITY_ID=$(terraform output -raw alert_severity_id)
@@ -30,7 +40,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Alert severity exists in API"
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
API_NAME=$(unwrap_value "$API_NAME_RAW")
if [ "$API_NAME" != "$EXPECTED_SEVERITY_NAME" ]; then
echo " ✗ FAILED: Severity name mismatch - Expected: '$EXPECTED_SEVERITY_NAME', Got: '$API_NAME'"
exit 1
@@ -56,24 +67,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Alert exists in API"
# Validate title
API_TITLE=$(echo "$RESPONSE" | jq -r '.title // empty')
# Validate title - handle wrapper object format
API_TITLE_RAW=$(echo "$RESPONSE" | jq '.title')
API_TITLE=$(unwrap_value "$API_TITLE_RAW")
if [ "$API_TITLE" != "$EXPECTED_TITLE" ]; then
echo " ✗ FAILED: Title mismatch - Expected: '$EXPECTED_TITLE', Got: '$API_TITLE'"
exit 1
fi
echo " ✓ Title matches: $API_TITLE"
# Validate alert severity relationship
API_SEVERITY_ID=$(echo "$RESPONSE" | jq -r '.alertSeverityId // empty')
# Validate alert severity relationship - handle wrapper object format (ObjectID)
API_SEVERITY_ID_RAW=$(echo "$RESPONSE" | jq '.alertSeverityId')
API_SEVERITY_ID=$(unwrap_value "$API_SEVERITY_ID_RAW")
if [ "$API_SEVERITY_ID" != "$ALERT_SEVERITY_ID" ]; then
echo " ✗ FAILED: Alert severity ID mismatch - Expected: '$ALERT_SEVERITY_ID', Got: '$API_SEVERITY_ID'"
exit 1
fi
echo " ✓ Alert severity ID matches"
# Validate server-provided currentAlertStateId
CURRENT_STATE_ID=$(echo "$RESPONSE" | jq -r '.currentAlertStateId // empty')
# Validate server-provided currentAlertStateId - handle wrapper object format (ObjectID)
CURRENT_STATE_ID_RAW=$(echo "$RESPONSE" | jq '.currentAlertStateId')
CURRENT_STATE_ID=$(unwrap_value "$CURRENT_STATE_ID_RAW")
if [ -n "$CURRENT_STATE_ID" ] && [ "$CURRENT_STATE_ID" != "null" ]; then
echo " ✓ Server-assigned currentAlertStateId: $CURRENT_STATE_ID"
fi

View File

@@ -4,6 +4,16 @@
set -e
# unwrap_value RAW_JSON
# Prints the scalar carried by a jq-encoded API field. When RAW_JSON is a
# wrapper object whose .value is present and truthy for jq (for example
# {"_type":"Version","value":"1.0.0"}), the inner value is printed;
# any other input is printed unchanged in jq raw (-r) form.
unwrap_value() {
    local payload="$1"
    local filter='.'
    # jq -e exits non-zero when .value is null/false or the input cannot be indexed
    if echo "$payload" | jq -e '.value' > /dev/null 2>&1; then
        filter='.value'
    fi
    echo "$payload" | jq -r "$filter"
}
# Get terraform outputs
RESOURCE_ID=$(terraform output -raw scheduled_maintenance_event_id)
EXPECTED_TITLE=$(terraform output -raw scheduled_maintenance_event_title)
@@ -14,7 +24,7 @@ echo " Verifying scheduled maintenance event with server defaults via API..."
echo " Resource ID: $RESOURCE_ID"
# Call API to get the resource
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/scheduled-maintenance-event/${RESOURCE_ID}/get-item" \
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/scheduled-maintenance/${RESOURCE_ID}/get-item" \
-H "Content-Type: application/json" \
-H "Apikey: $TF_VAR_api_key" \
-H "projectid: $TF_VAR_project_id" \
@@ -29,8 +39,9 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
fi
echo " ✓ Scheduled maintenance event exists in API"
# Validate title
API_TITLE=$(echo "$RESPONSE" | jq -r '.title // empty')
# Validate title - handle wrapper object format
API_TITLE_RAW=$(echo "$RESPONSE" | jq '.title')
API_TITLE=$(unwrap_value "$API_TITLE_RAW")
if [ "$API_TITLE" != "$EXPECTED_TITLE" ]; then
echo " ✗ FAILED: Title mismatch - Expected: '$EXPECTED_TITLE', Got: '$API_TITLE'"
exit 1
@@ -49,8 +60,9 @@ if [ -n "$API_ENDS_AT" ] && [ "$API_ENDS_AT" != "null" ]; then
echo " ✓ endsAt is set: $API_ENDS_AT"
fi
# Validate server-provided currentScheduledMaintenanceStateId
CURRENT_STATE_ID=$(echo "$RESPONSE" | jq -r '.currentScheduledMaintenanceStateId // empty')
# Validate server-provided currentScheduledMaintenanceStateId - handle wrapper object format (ObjectID)
CURRENT_STATE_ID_RAW=$(echo "$RESPONSE" | jq '.currentScheduledMaintenanceStateId')
CURRENT_STATE_ID=$(unwrap_value "$CURRENT_STATE_ID_RAW")
if [ -n "$CURRENT_STATE_ID" ] && [ "$CURRENT_STATE_ID" != "null" ]; then
echo " ✓ Server-assigned currentScheduledMaintenanceStateId: $CURRENT_STATE_ID"
fi

View File

@@ -0,0 +1,44 @@
# Terraform configuration for the probe_version READ idempotency test.
# Declares the oneuptime provider, one probe resource, and the two outputs
# (probe_id, probe_version) exposed by this configuration.
terraform {
required_providers {
oneuptime = {
source = "oneuptime/oneuptime"
version = "1.0.0"
}
}
}
# Provider connection settings come from input variables.
provider "oneuptime" {
oneuptime_url = var.oneuptime_url
api_key = var.api_key
}
# Test for probe_version READ operation idempotency
# This test validates that:
# 1. probe_version is stored correctly as "1.0.0" after create (not as wrapped JSON)
# 2. Running terraform apply again (idempotency check) doesn't detect drift
# 3. The READ operation properly unwraps {"_type":"Version","value":"1.0.0"} to "1.0.0"
#
# Bug scenario being tested:
# - First apply: CREATE succeeds, probe_version = "1.0.0" in state
# - Second apply: READ returns wrapped format {"_type":"Version","value":"1.0.0"}
# - Provider fails with "inconsistent result after apply"
resource "oneuptime_probe" "test" {
project_id = var.project_id
# key and name carry a timestamp() suffix, so they evaluate to a new string on every run.
key = "tf-probe-idem-${formatdate("YYYYMMDDhhmmss", timestamp())}"
name = "tf-probe-idempotency-test-${formatdate("YYYYMMDDhhmmss", timestamp())}"
# The attribute under test: a plain version string.
probe_version = "1.0.0"
# Ignore the timestamped attributes so a later plan does not report a diff
# caused only by the changing timestamp.
lifecycle {
ignore_changes = [key, name]
}
}
output "probe_id" {
value = oneuptime_probe.test.id
description = "ID of the created probe"
}
output "probe_version" {
value = oneuptime_probe.test.probe_version
description = "Version of the created probe - should always be '1.0.0', never wrapped JSON"
}

View File

@@ -0,0 +1,15 @@
# Input variables for this Terraform configuration. None declares a default,
# so each value must be supplied by the caller.

# Base URL of the OneUptime instance.
variable "oneuptime_url" {
type = string
description = "OneUptime API URL"
}
# API key; marked sensitive so Terraform redacts it in plan/apply output.
variable "api_key" {
type = string
description = "OneUptime API Key"
sensitive = true
}
# ID of the OneUptime project to operate on.
variable "project_id" {
type = string
description = "OneUptime Project ID"
}

View File

@@ -0,0 +1,88 @@
#!/bin/bash
# Verify script for 20-probe-version-idempotency test
#
# This test validates the probe_version READ idempotency issue:
# Bug: After CREATE, the READ operation returns wrapped format {"_type":"Version","value":"1.0.0"}
# instead of unwrapping it to "1.0.0", causing state drift.
#
# Test approach:
# 1. Check the probe_version in Terraform state (should be "1.0.0")
# 2. Run terraform plan to check for drift (should show no changes)
# 3. Verify via API that the data is consistent
#
# Environment read by this script: ONEUPTIME_URL, TF_VAR_api_key, TF_VAR_project_id.
# Exits 0 when every check passes, 1 on the first failed check.

set -e

echo " Testing probe_version idempotency (READ operation unwrapping)..."

# Get terraform outputs
RESOURCE_ID=$(terraform output -raw probe_id)
EXPECTED_VERSION=$(terraform output -raw probe_version)

echo " Resource ID: $RESOURCE_ID"
echo " Expected probe_version: $EXPECTED_VERSION"

# Step 1: Validate that probe_version in state is clean (not wrapped JSON)
if [[ "$EXPECTED_VERSION" == *"_type"* ]] || [[ "$EXPECTED_VERSION" == *'"value"'* ]]; then
    echo " ✗ FAILED: probe_version in state is wrapped JSON: $EXPECTED_VERSION"
    echo " Expected clean version string like '1.0.0'"
    exit 1
fi
echo " ✓ probe_version in state is clean: $EXPECTED_VERSION"

# Step 2: Run terraform plan and check for drift
# This is the critical test - if READ doesn't unwrap properly, plan will show drift
echo " Running terraform plan to check for drift..."

# Start from a known value so a PLAN_EXIT_CODE inherited from the caller's
# environment cannot be mistaken for the plan result.
PLAN_EXIT_CODE=0
# The "|| ..." keeps set -e from aborting before the exit code can be inspected.
PLAN_OUTPUT=$(terraform plan -detailed-exitcode 2>&1) || PLAN_EXIT_CODE=$?

# Exit code 0 = no changes (success)
# Exit code 2 = changes detected (drift)
# Anything else (1 = error, or an unexpected status such as a signal) = failure
case "$PLAN_EXIT_CODE" in
    0)
        ;;
    2)
        echo " ✗ FAILED: Terraform plan detected drift!"
        echo " This indicates the READ operation is not properly unwrapping the probe_version"
        echo " Plan output:"
        echo "$PLAN_OUTPUT"
        exit 1
        ;;
    *)
        echo " ✗ FAILED: Terraform plan error"
        echo "$PLAN_OUTPUT"
        exit 1
        ;;
esac
echo " ✓ Terraform plan shows no changes (idempotent)"

# Step 3: Verify via API that probe_version matches
echo " Verifying probe_version via API..."
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/probe/${RESOURCE_ID}/get-item" \
    -H "Content-Type: application/json" \
    -H "Apikey: $TF_VAR_api_key" \
    -H "projectid: $TF_VAR_project_id" \
    -d '{"select": {"_id": true, "probeVersion": true}}')

# Check if response contains the resource
API_ID=$(echo "$RESPONSE" | jq -r '._id // empty')
if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
    echo " ✗ FAILED: Probe not found in API response"
    echo " Response: $RESPONSE"
    exit 1
fi

# Extract probe version - handle wrapper object format
API_VERSION_RAW=$(echo "$RESPONSE" | jq '.probeVersion')
# jq -e succeeds only when .value exists and is neither null nor false
if echo "$API_VERSION_RAW" | jq -e '.value' > /dev/null 2>&1; then
    API_VERSION=$(echo "$API_VERSION_RAW" | jq -r '.value')
    echo " Note: API returns wrapped format: $API_VERSION_RAW"
    echo " Provider should unwrap to: $API_VERSION"
else
    API_VERSION=$(echo "$API_VERSION_RAW" | jq -r '.')
fi

if [ "$API_VERSION" != "$EXPECTED_VERSION" ]; then
    echo " ✗ FAILED: Probe version mismatch"
    echo " Terraform state: $EXPECTED_VERSION"
    echo " API (unwrapped): $API_VERSION"
    exit 1
fi
echo " ✓ probe_version matches: $API_VERSION"

echo " ✓ All probe_version idempotency tests passed"
echo " The READ operation correctly unwraps Version wrapper objects"

View File

@@ -1355,6 +1355,9 @@ func (r *${resourceTypeName}Resource) Delete(ctx context.Context, req resource.D
// Check if it's a wrapper object with value field (e.g., Version, DateTime types)
if innerVal, ok := val["value"].(string); ok {
${fieldName} = types.StringValue(innerVal)
} else if innerVal, ok := val["value"].(float64); ok {
// Handle numeric values that might be returned as float64
${fieldName} = types.StringValue(fmt.Sprintf("%v", innerVal))
} else if jsonBytes, err := json.Marshal(val); err == nil {
${fieldName} = types.StringValue(string(jsonBytes))
} else {
@@ -1366,12 +1369,25 @@ func (r *${resourceTypeName}Resource) Delete(ctx context.Context, req resource.D
${fieldName} = types.StringNull()
}`;
}
/*
* Default string handling - also unwrap wrapper objects for consistency
* This ensures that even if isComplexObject is not set correctly,
* wrapper objects like {"_type":"Version","value":"1.0.0"} are still properly unwrapped
* This fixes the READ operation drift issue where API returns wrapped format
*/
return `if obj, ok := ${responseValue}.(map[string]interface{}); ok {
// Handle ObjectID type responses
// Handle ObjectID type responses and wrapper objects (e.g., Version, DateTime, Name types)
if val, ok := obj["_id"].(string); ok && val != "" {
${fieldName} = types.StringValue(val)
} else if val, ok := obj["value"].(string); ok && val != "" {
} else if val, ok := obj["value"].(string); ok {
// Unwrap wrapper objects - extract the inner value regardless of whether it's empty
${fieldName} = types.StringValue(val)
} else if val, ok := obj["value"].(float64); ok {
// Handle numeric values that might be returned as float64
${fieldName} = types.StringValue(fmt.Sprintf("%v", val))
} else if jsonBytes, err := json.Marshal(obj); err == nil {
// Fallback to JSON marshaling for other complex objects
${fieldName} = types.StringValue(string(jsonBytes))
} else {
${fieldName} = types.StringNull()
}

View File

@@ -1 +1 @@
9.4.0
9.4.1