mirror of
https://github.com/OneUptime/oneuptime.git
synced 2026-04-06 00:32:12 +02:00
548
Docs/Plan/AlertDeduplication/1-DataModels.md
Normal file
548
Docs/Plan/AlertDeduplication/1-DataModels.md
Normal file
@@ -0,0 +1,548 @@
|
||||
# Data Models for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the database models required for Alert Deduplication functionality.
|
||||
|
||||
## Entity Relationship Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────┐
│    AlertFingerprint     │
├─────────────────────────┤
│ id                      │
│ projectId               │
│ fingerprint (hash)      │◄──────┐
│ fingerprintFields       │       │
│ canonicalAlertId        │───────┼──► Alert
│ duplicateCount          │       │
│ lastDuplicateAt         │       │
│ windowStartAt           │       │
│ windowEndAt             │       │
└─────────────────────────┘       │
                                  │
┌─────────────────────────────────┴───────────────────────────────────┐
│                          Alert (existing)                           │
├─────────────────────────────────────────────────────────────────────┤
│  + fingerprint (NEW)       - SHA-256 hash of alert                  │
│  + duplicateCount (NEW)    - Number of duplicates suppressed        │
│  + lastDuplicateAt (NEW)   - When last duplicate occurred           │
└─────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Model Definitions
|
||||
|
||||
### 1. AlertFingerprint
|
||||
|
||||
Cache of active fingerprints for deduplication lookups.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/AlertFingerprint.ts`
|
||||
|
||||
```typescript
|
||||
import {
  Column,
  Entity,
  Index,
  JoinColumn,
  ManyToOne,
} from 'typeorm';
import BaseModel from './DatabaseBaseModel/DatabaseBaseModel';
import Project from './Project';
import Alert from './Alert';
import ObjectID from 'Common/Types/ObjectID';
import ColumnAccessControl from 'Common/Types/Database/AccessControl/ColumnAccessControl';
import ColumnType from 'Common/Types/Database/ColumnType';
import TableColumn from 'Common/Types/Database/TableColumn';
import TableColumnType from 'Common/Types/Database/TableColumnType';
import TableMetadata from 'Common/Types/Database/TableMetadata';
import Permission from 'Common/Types/Permission';
import IconProp from 'Common/Types/Icon/IconProp';
|
||||
|
||||
/**
 * Access rules applied to every AlertFingerprint column: nobody can create or
 * update the value through the API, and only project owners and admins can
 * read it. A new object is built on each call so columns never share arrays.
 */
function fingerprintColumnAccess() {
  return {
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  };
}

/**
 * One row per (fingerprint, deduplication window). The row points at the
 * canonical alert and counts the duplicates suppressed during the window.
 */
@TableMetadata({
  tableName: 'AlertFingerprint',
  singularName: 'Alert Fingerprint',
  pluralName: 'Alert Fingerprints',
  icon: IconProp.Key,
  tableDescription: 'Stores fingerprints for alert deduplication',
})
@Entity({ name: 'AlertFingerprint' })
export default class AlertFingerprint extends BaseModel {
  /* ---------- owning project ---------- */

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.Entity,
    modelType: Project,
    title: 'Project',
  })
  @ManyToOne(() => Project, { onDelete: 'CASCADE', orphanedRowAction: 'delete' })
  @JoinColumn({ name: 'projectId' })
  public project?: Project = undefined;

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({ type: TableColumnType.ObjectID, title: 'Project ID' })
  @Column({ type: ColumnType.ObjectID, nullable: false })
  @Index()
  public projectId?: ObjectID = undefined;

  /* ---------- fingerprint value and the fields it was built from ---------- */

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.ShortText,
    title: 'Fingerprint',
    description: 'SHA-256 hash of the alert fields',
  })
  @Column({
    type: ColumnType.ShortText,
    length: 64, // a SHA-256 digest is 64 hex characters
    nullable: false,
  })
  @Index()
  public fingerprint?: string = undefined;

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.JSON,
    title: 'Fingerprint Fields',
    description: 'Fields used to compute this fingerprint',
  })
  @Column({ type: ColumnType.JSON, nullable: false })
  public fingerprintFields?: Array<string> = undefined;

  /* ---------- canonical (first) alert for this fingerprint ---------- */

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.Entity,
    modelType: Alert,
    title: 'Canonical Alert',
    description: 'The original alert this fingerprint refers to',
  })
  @ManyToOne(() => Alert, { onDelete: 'CASCADE', orphanedRowAction: 'delete' })
  @JoinColumn({ name: 'canonicalAlertId' })
  public canonicalAlert?: Alert = undefined;

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.ObjectID,
    title: 'Canonical Alert ID',
    description: 'ID of the original alert',
  })
  @Column({ type: ColumnType.ObjectID, nullable: false })
  @Index()
  public canonicalAlertId?: ObjectID = undefined;

  /* ---------- duplicate bookkeeping ---------- */

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Duplicate Count',
    description: 'Number of duplicate alerts suppressed',
  })
  @Column({ type: ColumnType.Number, nullable: false, default: 0 })
  public duplicateCount?: number = undefined;

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Last Duplicate At',
    description: 'When the last duplicate was received',
  })
  @Column({ type: ColumnType.Date, nullable: true })
  public lastDuplicateAt?: Date = undefined;

  /* ---------- deduplication window ---------- */

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Window Start',
    description: 'When this deduplication window started',
  })
  @Column({ type: ColumnType.Date, nullable: false })
  public windowStartAt?: Date = undefined;

  @ColumnAccessControl(fingerprintColumnAccess())
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Window End',
    description: 'When this deduplication window expires',
  })
  @Column({ type: ColumnType.Date, nullable: false })
  @Index()
  public windowEndAt?: Date = undefined;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Alert Model Enhancements
|
||||
|
||||
Add deduplication fields to existing Alert model.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/Alert.ts` (modifications)
|
||||
|
||||
```typescript
|
||||
// Add these fields to the existing Alert model:
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// DEDUPLICATION FIELDS (NEW)
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ShortText,
|
||||
title: 'Fingerprint',
|
||||
description: 'SHA-256 fingerprint hash for deduplication',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ShortText,
|
||||
length: 64,
|
||||
nullable: true,
|
||||
})
|
||||
@Index()
|
||||
public fingerprint?: string = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Duplicate Count',
|
||||
description: 'Number of duplicate alerts that were suppressed',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public duplicateCount?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Date,
|
||||
title: 'Last Duplicate At',
|
||||
description: 'When the last duplicate occurred',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Date,
|
||||
nullable: true,
|
||||
})
|
||||
public lastDuplicateAt?: Date = undefined;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. DeduplicationConfig (Project Settings)
|
||||
|
||||
Add deduplication settings to the Project model, or create a separate settings model.
|
||||
|
||||
**Option A: Add to Project model**
|
||||
|
||||
```typescript
|
||||
// In Project model, add:
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [Permission.ProjectOwner],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
update: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.JSON,
|
||||
title: 'Deduplication Config',
|
||||
description: 'Alert deduplication settings for this project',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.JSON,
|
||||
nullable: true,
|
||||
})
|
||||
public alertDeduplicationConfig?: DeduplicationConfig = undefined;
|
||||
```
|
||||
|
||||
**Option B: Separate AlertDeduplicationConfig model**
|
||||
|
||||
```typescript
|
||||
/**
 * Project-level deduplication settings, stored as one row per project
 * (enforced by the unique index on `projectId`).
 */
@TableMetadata({
  tableName: 'AlertDeduplicationConfig',
  singularName: 'Deduplication Config',
  pluralName: 'Deduplication Configs',
  icon: IconProp.Settings,
  tableDescription: 'Project-level deduplication settings',
})
@Entity({
  name: 'AlertDeduplicationConfig',
})
export default class AlertDeduplicationConfig extends BaseModel {
  /** Project these settings belong to; unique so a project has one config. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.ObjectID,
    title: 'Project ID',
  })
  @Column({
    type: ColumnType.ObjectID,
    nullable: false,
  })
  @Index({ unique: true })
  public projectId?: ObjectID = undefined;

  /** Master switch; defaults to enabled. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Boolean,
    title: 'Enabled',
    description: 'Whether deduplication is enabled',
  })
  @Column({
    type: ColumnType.Boolean,
    nullable: false,
    default: true,
  })
  public enabled?: boolean = undefined;

  /** Length of the deduplication window in minutes; defaults to 60. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Window Minutes',
    description: 'Time window for deduplication (minutes)',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
    default: 60,
  })
  public windowMinutes?: number = undefined;

  /** Alert fields that are hashed into the fingerprint. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.JSON,
    title: 'Fingerprint Fields',
    description: 'Fields to include in fingerprint',
  })
  @Column({
    type: ColumnType.JSON,
    nullable: false,
    // The default must be a valid JSON document. JSON strings use double
    // quotes; a single-quoted list is rejected by the database and also
    // breaks the quoting of the generated DEFAULT clause.
    default: '["monitorId","createdCriteriaId","alertSeverityId","title"]',
  })
  public fingerprintFields?: Array<string> = undefined;

  /** When true, string values are lower-cased and trimmed before hashing. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Boolean,
    title: 'Normalize Strings',
    description: 'Whether to normalize strings (lowercase, trim)',
  })
  @Column({
    type: ColumnType.Boolean,
    nullable: false,
    default: true,
  })
  public normalizeStrings?: boolean = undefined;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Type Definitions
|
||||
|
||||
```typescript
|
||||
// /Common/Types/Alert/DeduplicationConfig.ts
|
||||
|
||||
/**
 * Settings that control how alerts are fingerprinted and deduplicated.
 */
export interface DeduplicationConfig {
  /** Enable/disable deduplication. */
  enabled: boolean;

  /** Time window for deduplication (minutes). */
  windowMinutes: number;

  /** Fields to include in the fingerprint. */
  fingerprintFields: Array<string>;

  /** Whether to normalize strings (lowercase, trim) before hashing. */
  normalizeStrings: boolean;
}

/**
 * Configuration used when a project has not stored its own settings.
 */
export const DEFAULT_DEDUPLICATION_CONFIG: DeduplicationConfig = {
  enabled: true,
  windowMinutes: 60,
  fingerprintFields: ['monitorId', 'createdCriteriaId', 'alertSeverityId', 'title'],
  normalizeStrings: true,
};

/**
 * Catalogue of built-in alert fields that can be selected for fingerprinting.
 * It lists every built-in field that FingerprintGenerator resolves by name,
 * including `createdByProbeId`.
 */
export const AVAILABLE_FINGERPRINT_FIELDS: Array<{
  field: string;
  label: string;
  description: string;
}> = [
  {
    field: 'monitorId',
    label: 'Monitor',
    description: 'Include monitor in fingerprint',
  },
  {
    field: 'createdCriteriaId',
    label: 'Criteria',
    description: 'Include alert criteria in fingerprint',
  },
  {
    field: 'alertSeverityId',
    label: 'Severity',
    description: 'Include severity in fingerprint',
  },
  {
    field: 'title',
    label: 'Title',
    description: 'Include alert title in fingerprint',
  },
  {
    field: 'description',
    label: 'Description',
    description: 'Include alert description in fingerprint',
  },
  {
    field: 'createdByProbeId',
    label: 'Probe',
    description: 'Include the probe that created the alert in fingerprint',
  },
];
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Indexes
|
||||
|
||||
```sql
|
||||
-- AlertFingerprint indexes
|
||||
CREATE INDEX idx_fingerprint_lookup
|
||||
ON "AlertFingerprint" ("projectId", "fingerprint", "windowEndAt");
|
||||
|
||||
CREATE INDEX idx_fingerprint_cleanup
|
||||
ON "AlertFingerprint" ("windowEndAt");
|
||||
|
||||
CREATE INDEX idx_fingerprint_alert
|
||||
ON "AlertFingerprint" ("canonicalAlertId");
|
||||
|
||||
-- Alert fingerprint index
|
||||
CREATE INDEX idx_alert_fingerprint
|
||||
ON "Alert" ("projectId", "fingerprint")
|
||||
WHERE "fingerprint" IS NOT NULL;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
- [ ] Create AlertFingerprint model
|
||||
- [ ] Add fingerprint fields to Alert model
|
||||
- [ ] Create DeduplicationConfig type
|
||||
- [ ] Add config to Project model (or create separate model)
|
||||
- [ ] Register models in model registry
|
||||
- [ ] Create database migrations
|
||||
- [ ] Add indexes
|
||||
- [ ] Update API permissions
|
||||
667
Docs/Plan/AlertDeduplication/2-Backend.md
Normal file
667
Docs/Plan/AlertDeduplication/2-Backend.md
Normal file
@@ -0,0 +1,667 @@
|
||||
# Backend Implementation for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the backend services and components required for Alert Deduplication functionality.
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. FingerprintGenerator
|
||||
|
||||
Generates unique fingerprints for alerts based on configurable fields.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/FingerprintGenerator.ts`
|
||||
|
||||
```typescript
|
||||
import Alert from '../../Models/DatabaseModels/Alert';
|
||||
import crypto from 'crypto';
|
||||
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
|
||||
|
||||
/**
 * Builds deterministic SHA-256 fingerprints for alerts from a configurable
 * list of fields.
 */
export default class FingerprintGenerator {
  /**
   * Default fields used for fingerprinting.
   */
  public static DEFAULT_FIELDS: Array<string> = [
    'monitorId',
    'createdCriteriaId',
    'alertSeverityId',
    'title',
  ];

  /**
   * Generate a fingerprint hash for an alert.
   *
   * @param alert - Alert data; missing fields are treated as empty strings.
   * @param config - Optional overrides. `fingerprintFields` defaults to
   *   DEFAULT_FIELDS and `normalizeStrings` defaults to true.
   * @returns The SHA-256 digest as 64 lowercase hex characters.
   */
  public static generate(
    alert: Partial<Alert>,
    config?: Partial<DeduplicationConfig>
  ): string {
    const fields = config?.fingerprintFields ?? this.DEFAULT_FIELDS;
    const normalizeStrings = config?.normalizeStrings ?? true;

    const pairs: Array<[string, string]> = fields.map(
      (field: string): [string, string] => {
        const raw = this.getFieldValue(alert, field);
        return [field, normalizeStrings ? raw.toLowerCase().trim() : raw];
      }
    );

    // Hash a JSON encoding of the (field, value) pairs rather than a
    // "field:value|field:value" string. With plain delimiters, a value that
    // contains "|" or ":" can produce the same input as a different alert
    // (title "x|description:y" + description "z" collides with title "x" +
    // description "y|description:z"). JSON escaping keeps pairs unambiguous.
    return crypto
      .createHash('sha256')
      .update(JSON.stringify(pairs), 'utf8')
      .digest('hex');
  }

  /**
   * Get a field value from an alert object as a string.
   *
   * Built-in fields are read directly; any other name is looked up as an own
   * property of `alert.customFields`. Unknown or missing values yield ''.
   */
  private static getFieldValue(alert: Partial<Alert>, field: string): string {
    switch (field) {
      case 'monitorId':
        return alert.monitorId?.toString() ?? '';

      case 'createdCriteriaId':
        return alert.createdCriteriaId ?? '';

      case 'alertSeverityId':
      case 'severity':
        return alert.alertSeverityId?.toString() ?? '';

      case 'title':
        return alert.title ?? '';

      case 'description':
        return alert.description ?? '';

      case 'createdByProbeId':
        return alert.createdByProbeId?.toString() ?? '';

      default: {
        const customFields: unknown = alert.customFields;
        if (
          customFields !== null &&
          typeof customFields === 'object' &&
          // Own properties only: a field named "constructor" or "toString"
          // must not resolve to something inherited from Object.prototype.
          Object.prototype.hasOwnProperty.call(customFields, field)
        ) {
          return this.stringifyCustomValue(
            (customFields as Record<string, unknown>)[field]
          );
        }
        return '';
      }
    }
  }

  /**
   * Convert a custom-field value of unknown type to a stable string.
   * Objects are JSON-encoded so distinct objects do not all collapse to
   * "[object Object]"; values that cannot be encoded become ''.
   */
  private static stringifyCustomValue(value: unknown): string {
    if (value === null || value === undefined) {
      return '';
    }
    if (typeof value === 'string') {
      return value;
    }
    if (
      typeof value === 'number' ||
      typeof value === 'boolean' ||
      typeof value === 'bigint'
    ) {
      return String(value);
    }
    if (typeof value === 'object') {
      try {
        return JSON.stringify(value) ?? '';
      } catch {
        // e.g. circular structures
        return '';
      }
    }
    return '';
  }

  /**
   * Validate that all required fields are present for fingerprinting.
   *
   * @returns `valid` is true when every field resolves to a non-empty string;
   *   `missingFields` lists the ones that did not.
   */
  public static validateFields(
    alert: Partial<Alert>,
    fields: Array<string>
  ): { valid: boolean; missingFields: Array<string> } {
    const missingFields: Array<string> = [];

    for (const field of fields) {
      const value = this.getFieldValue(alert, field);
      if (!value) {
        missingFields.push(field);
      }
    }

    return {
      valid: missingFields.length === 0,
      missingFields,
    };
  }

  /**
   * Compare two fingerprints for exact equality.
   */
  public static areEqual(fingerprint1: string, fingerprint2: string): boolean {
    return fingerprint1 === fingerprint2;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. DeduplicationEngine
|
||||
|
||||
Handles the core deduplication logic.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/DeduplicationEngine.ts`
|
||||
|
||||
```typescript
|
||||
import Alert from '../../Models/DatabaseModels/Alert';
|
||||
import AlertFingerprint from '../../Models/DatabaseModels/AlertFingerprint';
|
||||
import AlertFingerprintService from '../../Services/AlertFingerprintService';
|
||||
import AlertService from '../../Services/AlertService';
|
||||
import FingerprintGenerator from './FingerprintGenerator';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import OneUptimeDate from 'Common/Types/Date';
|
||||
import QueryHelper from '../../Types/Database/QueryHelper';
|
||||
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
|
||||
|
||||
/**
 * Outcome of a duplicate check for one incoming alert.
 */
export interface DeduplicationResult {
  /** True when an active fingerprint record already exists for this alert. */
  isDuplicate: boolean;
  /** ID of the alert the duplicate was folded into (duplicates only). */
  canonicalAlertId?: ObjectID;
  /** Canonical alert with `_id`, `title` and `alertNumber` selected (duplicates only). */
  canonicalAlert?: Alert;
  /** Duplicate count after this alert was recorded (duplicates only). */
  duplicateCount?: number;
  /** Fingerprint computed for the incoming alert. */
  fingerprint: string;
}

/**
 * Core deduplication logic: fingerprint lookup, duplicate bookkeeping,
 * fingerprint registration and statistics.
 */
export default class DeduplicationEngine {
  /**
   * Check if an alert is a duplicate of an existing alert.
   *
   * When an unexpired fingerprint record exists, its duplicate counter and
   * the canonical alert's counter are both incremented and stamped with the
   * same timestamp.
   *
   * NOTE(review): the counter is read, incremented in memory and written
   * back, so concurrent duplicates can lose increments. Confirm whether the
   * service layer offers an atomic increment.
   */
  public static async checkDuplicate(
    alertData: Partial<Alert>,
    projectId: ObjectID,
    config?: Partial<DeduplicationConfig>
  ): Promise<DeduplicationResult> {
    const mergedConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };

    const fingerprint = FingerprintGenerator.generate(alertData, mergedConfig);

    // One timestamp for the whole operation, so the fingerprint record and
    // the canonical alert end up with identical lastDuplicateAt values.
    const now = new Date();

    // Look for a fingerprint whose window has not expired yet.
    const existingFingerprint = await AlertFingerprintService.findOneBy({
      query: {
        projectId,
        fingerprint,
        windowEndAt: QueryHelper.greaterThan(now),
      },
      select: {
        _id: true,
        canonicalAlertId: true,
        duplicateCount: true,
      },
      props: { isRoot: true },
    });

    if (!existingFingerprint) {
      return {
        isDuplicate: false,
        fingerprint,
      };
    }

    const newDuplicateCount = (existingFingerprint.duplicateCount || 0) + 1;
    const duplicateUpdate = {
      duplicateCount: newDuplicateCount,
      lastDuplicateAt: now,
    };

    await AlertFingerprintService.updateOneById({
      id: existingFingerprint.id!,
      data: { ...duplicateUpdate },
      props: { isRoot: true },
    });

    // Mirror the counters on the canonical alert.
    await AlertService.updateOneById({
      id: existingFingerprint.canonicalAlertId!,
      data: { ...duplicateUpdate },
      props: { isRoot: true },
    });

    // Load the canonical alert fields that callers report back.
    const canonicalAlert = await AlertService.findOneById({
      id: existingFingerprint.canonicalAlertId!,
      select: {
        _id: true,
        title: true,
        alertNumber: true,
      },
      props: { isRoot: true },
    });

    return {
      isDuplicate: true,
      canonicalAlertId: existingFingerprint.canonicalAlertId,
      canonicalAlert: canonicalAlert || undefined,
      duplicateCount: newDuplicateCount,
      fingerprint,
    };
  }

  /**
   * Register a new fingerprint for an alert.
   *
   * Uses the fingerprint already stored on the alert when present, otherwise
   * computes one. The window runs from now until now + `windowMinutes`.
   */
  public static async registerFingerprint(
    alert: Alert,
    config?: Partial<DeduplicationConfig>
  ): Promise<AlertFingerprint> {
    const mergedConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };

    const fingerprint =
      alert.fingerprint || FingerprintGenerator.generate(alert, mergedConfig);

    const now = new Date();
    const windowEnd = OneUptimeDate.addRemoveMinutes(
      now,
      mergedConfig.windowMinutes
    );

    const fingerprintRecord = await AlertFingerprintService.create({
      data: {
        projectId: alert.projectId,
        fingerprint,
        fingerprintFields: mergedConfig.fingerprintFields,
        canonicalAlertId: alert.id,
        duplicateCount: 0,
        windowStartAt: now,
        windowEndAt: windowEnd,
      } as AlertFingerprint,
      props: { isRoot: true },
    });

    return fingerprintRecord;
  }

  /**
   * Process an alert through deduplication.
   *
   * @returns `shouldCreate` is false when the alert is a duplicate and must
   *   not be persisted. `alertData` is the input, with `fingerprint` added
   *   when the alert should be created. `deduplicationResult` carries the
   *   check outcome. When deduplication is disabled, no lookup is done and
   *   the alert is always created, still carrying its fingerprint.
   */
  public static async processAlert(
    alertData: Partial<Alert>,
    projectId: ObjectID,
    config?: Partial<DeduplicationConfig>
  ): Promise<{
    shouldCreate: boolean;
    alertData: Partial<Alert>;
    deduplicationResult: DeduplicationResult;
  }> {
    const mergedConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };

    // Skip the lookup entirely if deduplication is disabled.
    if (!mergedConfig.enabled) {
      const fingerprint = FingerprintGenerator.generate(alertData, mergedConfig);
      return {
        shouldCreate: true,
        alertData: { ...alertData, fingerprint },
        deduplicationResult: {
          isDuplicate: false,
          fingerprint,
        },
      };
    }

    const result = await this.checkDuplicate(alertData, projectId, mergedConfig);

    if (result.isDuplicate) {
      return {
        shouldCreate: false,
        alertData,
        deduplicationResult: result,
      };
    }

    // Not a duplicate: attach the fingerprint so it is stored with the alert.
    return {
      shouldCreate: true,
      alertData: { ...alertData, fingerprint: result.fingerprint },
      deduplicationResult: result,
    };
  }

  /**
   * Get deduplication statistics for a project.
   *
   * Considers fingerprint records whose window started between `startDate`
   * and `endDate`. Each record counts as one unique alert, and
   * `deduplicationRate` is a percentage rounded to two decimals.
   */
  public static async getStatistics(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<{
    totalAlerts: number;
    uniqueAlerts: number;
    duplicateCount: number;
    deduplicationRate: number;
  }> {
    const fingerprints = await AlertFingerprintService.findBy({
      query: {
        projectId,
        windowStartAt: QueryHelper.between(startDate, endDate),
      },
      select: {
        _id: true,
        duplicateCount: true,
      },
      props: { isRoot: true },
    });

    const uniqueAlerts = fingerprints.length;
    const duplicateCount = fingerprints.reduce(
      (sum, fp) => sum + (fp.duplicateCount || 0),
      0
    );
    const totalAlerts = uniqueAlerts + duplicateCount;
    const deduplicationRate =
      totalAlerts > 0 ? (duplicateCount / totalAlerts) * 100 : 0;

    return {
      totalAlerts,
      uniqueAlerts,
      duplicateCount,
      deduplicationRate: Math.round(deduplicationRate * 100) / 100,
    };
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. AlertFingerprintService
|
||||
|
||||
Database service for AlertFingerprint model.
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertFingerprintService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import AlertFingerprint from '../Models/DatabaseModels/AlertFingerprint';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import QueryHelper from '../Types/Database/QueryHelper';
|
||||
|
||||
/**
 * Database service for AlertFingerprint records, with helpers for window
 * expiry, active-record lookup and window extension.
 */
export class Service extends DatabaseService<AlertFingerprint> {
  public constructor() {
    super(AlertFingerprint);
  }

  /**
   * Delete every fingerprint whose window ended before the current time.
   *
   * @returns Whatever `deleteBy` resolves to, typed here as a number.
   */
  public async cleanupExpired(): Promise<number> {
    const expiredBefore = QueryHelper.lessThan(new Date());

    return await this.deleteBy({
      query: { windowEndAt: expiredBefore },
      props: { isRoot: true },
    });
  }

  /**
   * Fetch the fingerprints of a project whose window is still open.
   */
  public async getActiveFingerprints(
    projectId: ObjectID
  ): Promise<Array<AlertFingerprint>> {
    const stillOpen = QueryHelper.greaterThan(new Date());

    const activeRecords = await this.findBy({
      query: {
        projectId,
        windowEndAt: stillOpen,
      },
      select: {
        _id: true,
        fingerprint: true,
        canonicalAlertId: true,
        duplicateCount: true,
        windowEndAt: true,
      },
      props: { isRoot: true },
    });

    return activeRecords;
  }

  /**
   * Move the end of a fingerprint's window to `newEndTime`
   * (intended for alerts that are still active).
   */
  public async extendWindow(
    fingerprintId: ObjectID,
    newEndTime: Date
  ): Promise<void> {
    await this.updateOneById({
      id: fingerprintId,
      data: { windowEndAt: newEndTime },
      props: { isRoot: true },
    });
  }
}

export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Integration with AlertService
|
||||
|
||||
Modify AlertService to use deduplication.
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertService.ts` (modifications)
|
||||
|
||||
```typescript
|
||||
import DeduplicationEngine from '../Utils/Alert/DeduplicationEngine';
|
||||
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
|
||||
|
||||
// In onBeforeCreate():
|
||||
protected async onBeforeCreate(
|
||||
createBy: CreateBy<Alert>
|
||||
): Promise<OnCreate<Alert>> {
|
||||
// ... existing code ...
|
||||
|
||||
// Get deduplication config for project
|
||||
const deduplicationConfig = await this.getDeduplicationConfig(
|
||||
createBy.data.projectId!
|
||||
);
|
||||
|
||||
// Process through deduplication engine
|
||||
const deduplicationResult = await DeduplicationEngine.processAlert(
|
||||
createBy.data,
|
||||
createBy.data.projectId!,
|
||||
deduplicationConfig
|
||||
);
|
||||
|
||||
if (!deduplicationResult.shouldCreate) {
|
||||
// This is a duplicate - don't create
|
||||
throw new DuplicateAlertException(
|
||||
`Duplicate of alert #${deduplicationResult.deduplicationResult.canonicalAlert?.alertNumber}`,
|
||||
deduplicationResult.deduplicationResult.canonicalAlertId!
|
||||
);
|
||||
}
|
||||
|
||||
// Add fingerprint to alert data
|
||||
createBy.data.fingerprint = deduplicationResult.alertData.fingerprint;
|
||||
|
||||
// ... rest of existing code ...
|
||||
}
|
||||
|
||||
// In onCreateSuccess():
|
||||
protected async onCreateSuccess(
|
||||
onCreate: OnCreate<Alert>,
|
||||
createdItem: Alert
|
||||
): Promise<Alert> {
|
||||
// ... existing code ...
|
||||
|
||||
// Register fingerprint for deduplication
|
||||
const deduplicationConfig = await this.getDeduplicationConfig(
|
||||
createdItem.projectId!
|
||||
);
|
||||
|
||||
if (deduplicationConfig.enabled) {
|
||||
await DeduplicationEngine.registerFingerprint(
|
||||
createdItem,
|
||||
deduplicationConfig
|
||||
);
|
||||
}
|
||||
|
||||
// ... rest of existing code ...
|
||||
}
|
||||
|
||||
// Helper method:
|
||||
/**
 * Load the deduplication settings of a project.
 *
 * The stored JSON may contain only some of the settings, so it is layered on
 * top of DEFAULT_DEDUPLICATION_CONFIG. Callers that read a setting directly
 * (for example `enabled`) therefore see the same effective value as
 * DeduplicationEngine, which applies the same defaults. Without the merge, a
 * stored config lacking `enabled` would be checked for duplicates but never
 * have its fingerprints registered.
 */
private async getDeduplicationConfig(
  projectId: ObjectID
): Promise<DeduplicationConfig> {
  const project = await ProjectService.findOneById({
    id: projectId,
    select: { alertDeduplicationConfig: true },
    props: { isRoot: true },
  });

  return {
    ...DEFAULT_DEDUPLICATION_CONFIG,
    ...(project?.alertDeduplicationConfig ?? {}),
  };
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. DuplicateAlertException
|
||||
|
||||
Custom exception for duplicate alerts.
|
||||
|
||||
**File Location:** `/Common/Types/Exception/DuplicateAlertException.ts`
|
||||
|
||||
```typescript
|
||||
import Exception from './Exception';
|
||||
import ExceptionCode from './ExceptionCode';
|
||||
import ObjectID from '../ObjectID';
|
||||
|
||||
/**
 * Raised when an incoming alert is recognised as a duplicate of an alert
 * that already exists, so the new alert must not be created.
 *
 * NOTE(review): relies on `ExceptionCode.DuplicateAlertException` existing -
 * presumably a new enum member that has to be added; confirm.
 */
export default class DuplicateAlertException extends Exception {
  /**
   * @param message - Human-readable description of the duplicate.
   * @param canonicalAlertId - ID of the existing alert the rejected alert duplicates.
   */
  public constructor(
    message: string,
    public canonicalAlertId: ObjectID
  ) {
    super(ExceptionCode.DuplicateAlertException, message);
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Worker Jobs
|
||||
|
||||
### 1. FingerprintCleanup Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertDeduplication/FingerprintCleanup.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
import { EVERY_HOUR } from 'Common/Utils/CronTime';
import AlertFingerprintService from 'Common/Server/Services/AlertFingerprintService';
import logger from 'Common/Server/Utils/Logger';
|
||||
|
||||
/*
 * Hourly job that purges expired alert fingerprints.
 * Does not run on worker startup; logs only when at least one row was removed.
 */
RunCron(
  'AlertDeduplication:FingerprintCleanup',
  { schedule: EVERY_HOUR, runOnStartup: false },
  async () => {
    const removedFingerprints = await AlertFingerprintService.cleanupExpired();

    // Nothing expired in this run - stay quiet.
    if (!(removedFingerprints > 0)) {
      return;
    }

    logger.info(`Cleaned up ${removedFingerprints} expired fingerprints`);
  }
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Redis Caching (Optional Enhancement)
|
||||
|
||||
For high-throughput systems, cache fingerprints in Redis.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/FingerprintCache.ts`
|
||||
|
||||
```typescript
|
||||
import Redis from '../../Infrastructure/Redis';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
|
||||
/**
 * Redis-backed cache of active alert fingerprints, keyed by project and
 * fingerprint hash.
 *
 * Each fingerprint occupies two keys:
 *   - `alert:fingerprint:<projectId>:<fingerprint>`        JSON entry
 *   - `alert:fingerprint:<projectId>:<fingerprint>:count`  live duplicate counter
 *
 * The counter lives in its own key so it can be bumped with INCR instead of
 * a read-modify-write of the JSON entry. `set` seeds it, `get` reads it back
 * and `delete` removes it, so the two keys stay consistent.
 *
 * NOTE(review): assumes the Redis wrapper exposes ioredis-style static
 * `get`, `setex`, `incr`, `ttl`, `expire` and `del` - confirm against the
 * actual Infrastructure/Redis API.
 */
export default class FingerprintCache {
  private static CACHE_PREFIX = 'alert:fingerprint:';
  private static COUNT_SUFFIX = ':count';
  private static DEFAULT_TTL_SECONDS = 3600; // 1 hour

  /**
   * Get a cached fingerprint.
   *
   * @returns The cached entry with the live duplicate count, or null when
   *          the fingerprint is not cached or the stored value is not a
   *          valid entry (treated as a cache miss).
   */
  public static async get(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<{ canonicalAlertId: string; duplicateCount: number } | null> {
    const key = this.buildKey(projectId, fingerprint);
    const [rawEntry, rawCount] = await Promise.all([
      Redis.get(key),
      Redis.get(this.buildCountKey(key)),
    ]);

    if (!rawEntry) {
      return null;
    }

    const entry = this.parseEntry(rawEntry);

    if (!entry) {
      return null;
    }

    // Prefer the live counter; fall back to the snapshot stored in the entry.
    const liveCount = rawCount ? Number.parseInt(rawCount, 10) : Number.NaN;

    return {
      canonicalAlertId: entry.canonicalAlertId,
      duplicateCount: Number.isNaN(liveCount)
        ? entry.duplicateCount
        : liveCount,
    };
  }

  /**
   * Set a fingerprint in cache.
   *
   * Also seeds the live counter from `data.duplicateCount` with the same
   * TTL, so both keys expire together.
   */
  public static async set(
    projectId: ObjectID,
    fingerprint: string,
    data: { canonicalAlertId: string; duplicateCount: number },
    ttlSeconds: number = this.DEFAULT_TTL_SECONDS
  ): Promise<void> {
    const key = this.buildKey(projectId, fingerprint);

    await Promise.all([
      Redis.setex(key, ttlSeconds, JSON.stringify(data)),
      Redis.setex(
        this.buildCountKey(key),
        ttlSeconds,
        String(data.duplicateCount)
      ),
    ]);
  }

  /**
   * Increment duplicate count in cache.
   *
   * @returns The counter value after the increment.
   */
  public static async incrementDuplicateCount(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<number> {
    const countKey = this.buildCountKey(this.buildKey(projectId, fingerprint));
    const count = await Redis.incr(countKey);

    /*
     * INCR creates a missing key without an expiry. A result of 1 means the
     * key was either just created or was seeded with 0 by set(); only in the
     * former case (negative TTL) does it need an expiry so it cannot leak.
     */
    if (count === 1 && (await Redis.ttl(countKey)) < 0) {
      await Redis.expire(countKey, this.DEFAULT_TTL_SECONDS);
    }

    return count;
  }

  /**
   * Delete a fingerprint (entry and counter) from cache.
   */
  public static async delete(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<void> {
    const key = this.buildKey(projectId, fingerprint);

    await Promise.all([Redis.del(key), Redis.del(this.buildCountKey(key))]);
  }

  private static buildKey(projectId: ObjectID, fingerprint: string): string {
    return `${this.CACHE_PREFIX}${projectId.toString()}:${fingerprint}`;
  }

  /** Key of the live duplicate counter that accompanies an entry key. */
  private static buildCountKey(key: string): string {
    return `${key}${this.COUNT_SUFFIX}`;
  }

  /** Parses and shape-checks a stored entry; returns null when it is malformed. */
  private static parseEntry(
    raw: string
  ): { canonicalAlertId: string; duplicateCount: number } | null {
    let parsed: unknown;

    try {
      parsed = JSON.parse(raw);
    } catch {
      return null;
    }

    if (typeof parsed !== 'object' || parsed === null) {
      return null;
    }

    const record = parsed as Record<string, unknown>;
    const canonicalAlertId = record['canonicalAlertId'];
    const duplicateCount = record['duplicateCount'];

    if (
      typeof canonicalAlertId !== 'string' ||
      typeof duplicateCount !== 'number'
    ) {
      return null;
    }

    return { canonicalAlertId, duplicateCount };
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Phase 1: Core Components
|
||||
- [ ] Create FingerprintGenerator utility
|
||||
- [ ] Create DeduplicationEngine
|
||||
- [ ] Create AlertFingerprintService
|
||||
- [ ] Create DuplicateAlertException
|
||||
|
||||
### Phase 2: Integration
|
||||
- [ ] Modify AlertService.onBeforeCreate()
|
||||
- [ ] Modify AlertService.onCreateSuccess()
|
||||
- [ ] Add fingerprint fields to Alert model
|
||||
- [ ] Create AlertFingerprint model
|
||||
|
||||
### Phase 3: Background Jobs
|
||||
- [ ] Create FingerprintCleanup job
|
||||
- [ ] Register job in worker
|
||||
|
||||
### Phase 4: Testing
|
||||
- [ ] Unit tests for FingerprintGenerator
|
||||
- [ ] Unit tests for DeduplicationEngine
|
||||
- [ ] Integration tests for deduplication flow
|
||||
- [ ] Performance tests for high-volume scenarios
|
||||
|
||||
### Phase 5: Optional Enhancements
|
||||
- [ ] Redis caching for fingerprints
|
||||
- [ ] Configurable fingerprint fields per project
|
||||
- [ ] Deduplication analytics API
|
||||
287
Docs/Plan/AlertDeduplication/3-API.md
Normal file
287
Docs/Plan/AlertDeduplication/3-API.md
Normal file
@@ -0,0 +1,287 @@
|
||||
# API Design for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the REST API endpoints for Alert Deduplication functionality.
|
||||
|
||||
## Deduplication Configuration API
|
||||
|
||||
### Get Deduplication Config
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-deduplication-config
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"enabled": true,
|
||||
"windowMinutes": 60,
|
||||
"fingerprintFields": ["monitorId", "createdCriteriaId", "alertSeverityId", "title"],
|
||||
"normalizeStrings": true
|
||||
}
|
||||
```
|
||||
|
||||
### Update Deduplication Config
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-deduplication-config
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"enabled": true,
|
||||
"windowMinutes": 120,
|
||||
"fingerprintFields": ["monitorId", "alertSeverityId", "title"],
|
||||
"normalizeStrings": true
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Deduplication Statistics API
|
||||
|
||||
### Get Deduplication Statistics
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-deduplication-stats
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"period": {
|
||||
"startDate": "2026-01-13T00:00:00Z",
|
||||
"endDate": "2026-01-20T00:00:00Z"
|
||||
},
|
||||
"totalAlerts": 5000,
|
||||
"uniqueAlerts": 2500,
|
||||
"duplicateCount": 2500,
|
||||
"deduplicationRate": 50.0,
|
||||
"topDuplicatedAlerts": [
|
||||
{
|
||||
"alertId": "alert-1",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"duplicateCount": 150,
|
||||
"monitor": { "name": "mysql-prod" }
|
||||
},
|
||||
{
|
||||
"alertId": "alert-2",
|
||||
"alertTitle": "API latency high",
|
||||
"duplicateCount": 89,
|
||||
"monitor": { "name": "api-gateway" }
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert Fingerprint API
|
||||
|
||||
### List Active Fingerprints
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-fingerprint
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "fingerprint-1",
|
||||
"fingerprint": "a1b2c3d4...",
|
||||
"canonicalAlert": {
|
||||
"_id": "alert-1",
|
||||
"alertNumber": 123,
|
||||
"title": "MySQL connection timeout"
|
||||
},
|
||||
"duplicateCount": 15,
|
||||
"lastDuplicateAt": "2026-01-20T10:45:00Z",
|
||||
"windowEndAt": "2026-01-20T11:00:00Z"
|
||||
}
|
||||
],
|
||||
"count": 50
|
||||
}
|
||||
```
|
||||
|
||||
### Get Fingerprint Details
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-fingerprint/{fingerprintId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert Response Enhancement
|
||||
|
||||
The Alert response now includes deduplication fields:
|
||||
|
||||
```json
|
||||
{
|
||||
"_id": "alert-1",
|
||||
"alertNumber": 123,
|
||||
"title": "MySQL connection timeout",
|
||||
"fingerprint": "a1b2c3d4e5f6...",
|
||||
"duplicateCount": 15,
|
||||
"lastDuplicateAt": "2026-01-20T10:45:00Z",
|
||||
"// ... other fields"
|
||||
}
|
||||
```
|
||||
|
||||
### Filter Alerts by Duplicate Count
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert?duplicateCount.gt=10
|
||||
```
|
||||
|
||||
Get alerts with more than 10 duplicates.
|
||||
|
||||
---
|
||||
|
||||
## Available Fingerprint Fields API
|
||||
|
||||
### Get Available Fields
|
||||
|
||||
```http
|
||||
GET /api/alert-deduplication-config/available-fields
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"fields": [
|
||||
{
|
||||
"field": "monitorId",
|
||||
"label": "Monitor",
|
||||
"description": "Include monitor in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "createdCriteriaId",
|
||||
"label": "Criteria",
|
||||
"description": "Include alert criteria in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "alertSeverityId",
|
||||
"label": "Severity",
|
||||
"description": "Include severity in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "title",
|
||||
"label": "Title",
|
||||
"description": "Include alert title in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "description",
|
||||
"label": "Description",
|
||||
"description": "Include alert description in fingerprint"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Test Fingerprint API
|
||||
|
||||
### Generate Test Fingerprint
|
||||
|
||||
Test what fingerprint would be generated for given alert data.
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-deduplication-config/test
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertData": {
|
||||
"monitorId": "monitor-1",
|
||||
"alertSeverityId": "severity-1",
|
||||
"title": "MySQL connection timeout"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"fingerprint": "a1b2c3d4e5f6...",
|
||||
"fieldsUsed": ["monitorId", "alertSeverityId", "title"],
|
||||
"fieldValues": {
|
||||
"monitorId": "monitor-1",
|
||||
"alertSeverityId": "severity-1",
|
||||
"title": "mysql connection timeout"
|
||||
},
|
||||
"wouldBeDuplicateOf": {
|
||||
"alertId": "alert-123",
|
||||
"alertNumber": 123,
|
||||
"alertTitle": "MySQL connection timeout"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Responses
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "DUPLICATE_ALERT",
|
||||
"message": "Duplicate of alert #123",
|
||||
"data": {
|
||||
"canonicalAlertId": "alert-123",
|
||||
"canonicalAlertNumber": 123,
|
||||
"duplicateCount": 16
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Note: This is typically not shown to users as duplicates are handled silently.
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Configuration API
|
||||
- [ ] GET /alert-deduplication-config
|
||||
- [ ] PUT /alert-deduplication-config
|
||||
- [ ] GET /alert-deduplication-config/available-fields
|
||||
- [ ] POST /alert-deduplication-config/test
|
||||
|
||||
### Statistics API
|
||||
- [ ] GET /alert-deduplication-stats
|
||||
|
||||
### Fingerprint API
|
||||
- [ ] GET /alert-fingerprint (list)
|
||||
- [ ] GET /alert-fingerprint/:id (details)
|
||||
|
||||
### Alert API Updates
|
||||
- [ ] Add fingerprint to response
|
||||
- [ ] Add duplicateCount to response
|
||||
- [ ] Add duplicateCount filter
|
||||
259
Docs/Plan/AlertDeduplication/4-UI.md
Normal file
259
Docs/Plan/AlertDeduplication/4-UI.md
Normal file
@@ -0,0 +1,259 @@
|
||||
# UI Implementation for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the frontend components and pages required for Alert Deduplication functionality.
|
||||
|
||||
## Navigation Structure
|
||||
|
||||
```
|
||||
Dashboard
|
||||
└── Settings
|
||||
└── Alerts
|
||||
├── Alert States (existing)
|
||||
├── Alert Severities (existing)
|
||||
├── Grouping Rules
|
||||
├── Suppression Rules
|
||||
└── Deduplication (NEW)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages to Create
|
||||
|
||||
### 1. Deduplication Settings Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertDeduplication.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-deduplication`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Alert Deduplication │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Deduplication prevents duplicate alerts from being created. When a duplicate │ │
|
||||
│ │ alert is detected, it increments the count on the original alert instead. │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ DEDUPLICATION STATUS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ✅ Deduplication is ENABLED [Disable] │ │
|
||||
│ └──────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ STATISTICS (Last 7 Days) │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
|
||||
│ │ Total Alerts │ │ Unique Alerts │ │ Deduplicated │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ 5,000 │ │ 2,500 │ │ 2,500 │ │
|
||||
│ │ │ │ │ │ (50%) │ │
|
||||
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
|
||||
│ │
|
||||
│ CONFIGURATION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Deduplication Window │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 60 │ minutes │
|
||||
│ └──────────┘ │
|
||||
│ Alerts with the same fingerprint within this window are considered duplicates. │
|
||||
│ │
|
||||
│ Fingerprint Fields │
|
||||
│ Select which fields to include when computing the alert fingerprint: │
|
||||
│ │
|
||||
│ ☑ Monitor - Include monitor in fingerprint │
|
||||
│ ☑ Criteria - Include alert criteria in fingerprint │
|
||||
│ ☑ Severity - Include severity level in fingerprint │
|
||||
│ ☑ Title - Include alert title in fingerprint │
|
||||
│ ☐ Description - Include alert description in fingerprint │
|
||||
│ │
|
||||
│ String Normalization │
|
||||
│ ☑ Normalize strings (convert to lowercase, trim whitespace) │
|
||||
│ │
|
||||
│ [Save Changes] │
|
||||
│ │
|
||||
│ TEST FINGERPRINT │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Test what fingerprint would be generated for an alert: │
|
||||
│ │
|
||||
│ Monitor Severity Title │
|
||||
│ ┌───────────────┐ ┌───────────────┐ ┌─────────────────────────────────────────┐ │
|
||||
│ │ mysql-prod [▼]│ │ Critical [▼] │ │ MySQL connection timeout │ │
|
||||
│ └───────────────┘ └───────────────┘ └─────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [Generate Fingerprint] │
|
||||
│ │
|
||||
│ Result: │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Fingerprint: a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6 │ │
|
||||
│ │ This would be a DUPLICATE of Alert #123: "MySQL connection timeout" │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Alert Detail Enhancement
|
||||
|
||||
Add deduplication info to Alert detail page.
|
||||
|
||||
**Wireframe Addition:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alert #123: MySQL connection timeout │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────┐ │
|
||||
│ │ DEDUPLICATION INFO │ │
|
||||
│ │ ────────────────────────────────────────── │ │
|
||||
│ │ │ │
|
||||
│ │ 🔢 Duplicate Count: 15 │ │
|
||||
│ │ This alert represents 16 total │ │
|
||||
│ │ occurrences (1 original + 15 dupes) │ │
|
||||
│ │ │ │
|
||||
│ │ 🕐 Last Duplicate: 10 minutes ago │ │
|
||||
│ │ │ │
|
||||
│ │ 🔑 Fingerprint: │ │
|
||||
│ │ a1b2c3d4e5f6... [Copy]│ │
|
||||
│ └──────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ // ... rest of alert details │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Alerts Table Enhancement
|
||||
|
||||
Add duplicate count column to alerts table.
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌───────┬──────────────────────────────────┬──────────┬───────┬──────┬────────────┐│
|
||||
│ │ ID │ Title │ Severity │ Dupes │ State│ Age ││
|
||||
│ ├───────┼──────────────────────────────────┼──────────┼───────┼──────┼────────────┤│
|
||||
│ │ #127 │ MySQL connection timeout │ Critical │ x15 │ ● │ 2m ││
|
||||
│ │ #126 │ Disk space low │ Warning │ x3 │ ● │ 15m ││
|
||||
│ │ #125 │ API response slow │ High │ — │ ✓ │ 1h ││
|
||||
│ │ #124 │ Memory usage high │ Warning │ x47 │ ● │ 2h ││
|
||||
│ └───────┴──────────────────────────────────┴──────────┴───────┴──────┴────────────┘│
|
||||
│ │
|
||||
│ Dupes = Number of duplicate alerts suppressed │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Components to Create
|
||||
|
||||
### 1. DeduplicationStatsCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/DeduplicationStatsCard.tsx`
|
||||
|
||||
Shows deduplication statistics in a card format.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the card that summarises deduplication statistics.
 * The fields mirror the deduplication statistics API response.
 */
interface DeduplicationStatsCardProps {
  /** Total number of alerts in the period. */
  totalAlerts: number;

  /** Number of unique alerts in the period. */
  uniqueAlerts: number;

  /** Number of alerts recognised as duplicates. */
  duplicateCount: number;

  /** Deduplication rate as a percentage (e.g. 50.0). */
  deduplicationRate: number;
}
|
||||
```
|
||||
|
||||
### 2. FingerprintFieldSelector
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/FingerprintFieldSelector.tsx`
|
||||
|
||||
Checkbox list for selecting fingerprint fields.
|
||||
|
||||
```typescript
|
||||
/** One selectable fingerprint field, as returned by the available-fields API. */
interface FingerprintFieldOption {
  /** Alert field name, e.g. `monitorId`. */
  field: string;

  /** Short label shown next to the checkbox. */
  label: string;

  /** Longer explanation of what including the field means. */
  description: string;
}

/**
 * Props for the checkbox list used to choose which alert fields go into the
 * fingerprint.
 */
interface FingerprintFieldSelectorProps {
  /** Field names that are currently checked. */
  selectedFields: Array<string>;

  /** Called with the new list of field names. */
  onChange: (fields: Array<string>) => void;

  /** Every field the user may choose from. */
  availableFields: Array<FingerprintFieldOption>;
}
|
||||
```
|
||||
|
||||
### 3. FingerprintTester
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/FingerprintTester.tsx`
|
||||
|
||||
Form for testing fingerprint generation.
|
||||
|
||||
### 4. DuplicateCountBadge
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/DuplicateCountBadge.tsx`
|
||||
|
||||
Badge showing duplicate count.
|
||||
|
||||
```typescript
|
||||
/** Props for the badge that displays an alert's duplicate count. */
interface DuplicateCountBadgeProps {
  /** Number of duplicates to display. */
  count: number;

  /**
   * Optional flag.
   * NOTE(review): presumably controls whether the badge renders when
   * `count` is 0 - confirm the intended default.
   */
  showIfZero?: boolean;
}
|
||||
```
|
||||
|
||||
### 5. DeduplicationInfoCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/DeduplicationInfoCard.tsx`
|
||||
|
||||
Card for alert detail page showing deduplication info.
|
||||
|
||||
---
|
||||
|
||||
## Routing Configuration
|
||||
|
||||
Add to route configuration:
|
||||
|
||||
```typescript
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-deduplication',
|
||||
component: AlertDeduplicationPage,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pages
|
||||
- [ ] Deduplication settings page
|
||||
|
||||
### Components
|
||||
- [ ] DeduplicationStatsCard
|
||||
- [ ] FingerprintFieldSelector
|
||||
- [ ] FingerprintTester
|
||||
- [ ] DuplicateCountBadge
|
||||
- [ ] DeduplicationInfoCard
|
||||
|
||||
### Existing Page Updates
|
||||
- [ ] Add duplicate count column to Alerts table
|
||||
- [ ] Add deduplication info to Alert detail page
|
||||
- [ ] Add sidebar navigation item
|
||||
|
||||
### Styling
|
||||
- [ ] Stats card styles
|
||||
- [ ] Badge styles
|
||||
- [ ] Field selector styles
|
||||
165
Docs/Plan/AlertDeduplication/README.md
Normal file
165
Docs/Plan/AlertDeduplication/README.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# Alert Deduplication Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This sub-plan details the implementation of Alert Deduplication and Fingerprinting functionality for OneUptime. This feature prevents duplicate alerts from being created and tracks duplicate occurrences.
|
||||
|
||||
## Documents
|
||||
|
||||
| Document | Description |
|
||||
|----------|-------------|
|
||||
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
|
||||
| [2-Backend.md](./2-Backend.md) | Backend services and deduplication engine |
|
||||
| [3-API.md](./3-API.md) | REST API endpoints |
|
||||
| [4-UI.md](./4-UI.md) | Frontend components and pages |
|
||||
|
||||
## Feature Summary
|
||||
|
||||
### What is Alert Deduplication?
|
||||
|
||||
Alert Deduplication prevents the same alert from being created multiple times within a configurable time window. Instead of creating duplicate alerts, the system increments a counter on the original alert.
|
||||
|
||||
### How Fingerprinting Works
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alert Fingerprint Generation │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
Alert Data Fingerprint Fields Hash
|
||||
┌─────────────────┐ ┌───────────────────┐ ┌────────────┐
|
||||
│ monitorId: abc │ │ monitorId: abc │ │ │
|
||||
│ criteriaId: xyz │ ──► │ criteriaId: xyz │ ──► │ SHA-256 │
|
||||
│ severity: high │ │ severity: high │ │ = a1b2c3.. │
|
||||
│ title: "Error" │ │ title: "Error" │ │ │
|
||||
│ time: 10:00 AM │ │ (time excluded) │ │ │
|
||||
└─────────────────┘ └───────────────────┘ └────────────┘
|
||||
```
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
1. **Fingerprint Generation** - Compute unique hash from alert fields
|
||||
2. **Time-Window Deduplication** - Deduplicate within configurable window
|
||||
3. **Duplicate Counting** - Track how many duplicates were suppressed
|
||||
4. **Configurable Fields** - Choose which fields to include in fingerprint
|
||||
5. **Per-Project Settings** - Customize deduplication per project
|
||||
|
||||
### Benefits
|
||||
|
||||
| Without Deduplication | With Deduplication |
|
||||
|-----------------------|-------------------|
|
||||
| 100 identical alerts created | 1 alert created, with a duplicate count of 99 |
|
||||
| 100 notifications sent | 1 notification sent |
|
||||
| Alert fatigue | Reduced noise |
|
||||
| Storage waste | Efficient storage |
|
||||
|
||||
### User Stories
|
||||
|
||||
```
|
||||
As an operator, I want duplicate alerts to be automatically merged
|
||||
so that I don't see the same alert repeated 50 times.
|
||||
|
||||
As a team lead, I want to know how many times an alert occurred
|
||||
so that I can understand the severity of the issue.
|
||||
|
||||
As an SRE, I want to configure the deduplication window
|
||||
so that I can tune it for my team's workflow.
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Core Fingerprinting (Week 1)
|
||||
|
||||
- [ ] Create FingerprintGenerator utility
|
||||
- [ ] Add fingerprint field to Alert model
|
||||
- [ ] Implement basic SHA-256 fingerprinting
|
||||
- [ ] Add duplicate count field to Alert
|
||||
|
||||
### Phase 2: Deduplication Engine (Week 2)
|
||||
|
||||
- [ ] Create AlertFingerprint cache model
|
||||
- [ ] Implement DeduplicationEngine
|
||||
- [ ] Integrate with AlertService
|
||||
- [ ] Add time-window support
|
||||
|
||||
### Phase 3: Configuration & UI (Week 3)
|
||||
|
||||
- [ ] Add project-level deduplication settings
|
||||
- [ ] Create deduplication configuration UI
|
||||
- [ ] Add duplicate count to Alert detail view
|
||||
- [ ] Add deduplication metrics
|
||||
|
||||
### Phase 4: Advanced Features (Week 4)
|
||||
|
||||
- [ ] Configurable fingerprint fields
|
||||
- [ ] Redis caching for fingerprints
|
||||
- [ ] Deduplication analytics dashboard
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Deduplication Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌──────────────────────┐
|
||||
│ Alert Trigger │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ FingerprintGenerator │
|
||||
│ .generate() │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ DeduplicationEngine │
|
||||
│ .checkDuplicate() │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
┌────────────────┴────────────────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐
|
||||
│ DUPLICATE │ │ NEW │
|
||||
│ │ │ │
|
||||
│ - Increment │ │ - Create alert │
|
||||
│ count on │ │ - Register │
|
||||
│ original │ │ fingerprint │
|
||||
│ - Skip creation │ │ - Send notifs │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
```typescript
|
||||
/** Per-project alert deduplication settings. */
interface DeduplicationConfig {
  /** Enable/disable deduplication. */
  enabled: boolean;

  /** Time window for deduplication, in minutes. Default: 60 */
  windowMinutes: number;

  /**
   * Alert fields to include in the fingerprint.
   * Default: ['monitorId', 'createdCriteriaId', 'alertSeverityId', 'title']
   */
  fingerprintFields: Array<string>;

  /** Whether to normalize strings (lowercase, trim). Default: true */
  normalizeStrings: boolean;
}
|
||||
```
|
||||
|
||||
## Success Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Duplicate detection accuracy | > 99% |
|
||||
| Fingerprint generation time | < 5ms |
|
||||
| Storage reduction | 30-50% |
|
||||
| Notification reduction | 40-60% |
|
||||
|
||||
## References
|
||||
|
||||
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
|
||||
- [Alert Grouping Plan](../AlertGrouping/README.md)
|
||||
- [Alert Suppression Plan](../AlertSuppression/README.md)
|
||||
319
Docs/Plan/AlertGrouping/Summary.md
Normal file
319
Docs/Plan/AlertGrouping/Summary.md
Normal file
@@ -0,0 +1,319 @@
|
||||
# Alert Grouping / Episodes - Summary
|
||||
|
||||
## What is Alert Grouping?
|
||||
|
||||
Alert Grouping is a feature that automatically combines related alerts into logical containers called **Episodes**. Instead of seeing 50 individual "connection timeout" alerts, operators see one episode: "Database Connectivity Issues (50 alerts)".
|
||||
|
||||
## Key Capabilities
|
||||
|
||||
1. **Automatic Grouping** - Rules-based grouping of alerts into episodes
|
||||
2. **Time-Window Grouping** - Group alerts occurring within N minutes
|
||||
3. **Field-Based Grouping** - Group by monitor, monitor custom fields, severity, labels, etc.
|
||||
4. **Manual Management** - Add/remove alerts from episodes (merge/split deferred to future)
|
||||
5. **Episode Lifecycle** - Active → Acknowledged → Resolved states. These should be linked to alert states.
|
||||
6. **Root Cause Tracking** - Document root cause analysis per episode. This is a placeholder field for the user to fill out. We could also offer a "Generate with AI" option that summarizes the episode based on the root causes of all the alerts in the episode.
|
||||
7. **Flapping Prevention** - Grace periods before resolution and reopen windows
|
||||
|
||||
## Data Models
|
||||
|
||||
### Three New Models
|
||||
|
||||
| Model | Purpose |
|
||||
|-------|---------|
|
||||
| **AlertEpisode** | Container for grouped alerts (title, state, severity, timing, ownership) |
|
||||
| **AlertEpisodeMember** | Links alerts to episodes with metadata (addedBy, addedAt) |
|
||||
| **AlertGroupingRule** | Configures automatic grouping behavior (match criteria, grouping config, priority) |
|
||||
|
||||
### Alert Model Enhancements
|
||||
|
||||
- `episodeId` - Link to parent episode (nullable)
|
||||
|
||||
## Grouping Types
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| **Time Window** | Groups alerts within N minutes of each other |
|
||||
| **Field-Based** | Groups by matching fields (monitor, severity, labels) |
|
||||
| **Smart** | ML-based similarity matching (future) |
|
||||
|
||||
## Flapping Prevention
|
||||
|
||||
- **resolveDelayMinutes** - Grace period before auto-resolving (prevents rapid state changes)
|
||||
- **reopenWindowMinutes** - Window after resolution where episode can be reopened instead of creating new
|
||||
|
||||
## On-Call Policy Resolution
|
||||
|
||||
Priority chain for notifications:
|
||||
1. A grouping rule can be linked to an on-call policy. When an episode is created via a grouping rule, that rule's on-call policy is used.
|
||||
2. If the alert has its own on-call policy, that policy is used as well, alongside the grouping rule's on-call policy.
|
||||
3. If neither the grouping rule nor alert has an on-call policy, no notifications are sent.
|
||||
|
||||
When an alert joins an episode, the alert policy (if any) is executed as normal. The episode's on-call policy is also executed. This means that if an alert has an on-call policy, notifications may be sent twice - once for the alert and once for the episode. If the episode policy is executed and then a new alert joins the episode, the episode's on-call policy is NOT re-executed.
|
||||
|
||||
### Worker Jobs
|
||||
- **EpisodeAutoResolve** - Resolves episodes when all alerts resolved
|
||||
- **EpisodeBreakInactive** - Resolves episodes after inactivity period
|
||||
|
||||
## Database Migrations
|
||||
|
||||
Please do not write Database migrations. I will do that manually.
|
||||
|
||||
---
|
||||
|
||||
## Implementation Q&A (Industry Best Practices)
|
||||
|
||||
### Episode State Management
|
||||
|
||||
**Q1: How should episode states link to alert states?**
|
||||
|
||||
The episode state should reflect the "worst" or most urgent state among its member alerts:
|
||||
- **Active**: At least one alert in the episode is in an active/firing state
|
||||
- **Acknowledged**: All active alerts have been acknowledged, but not yet resolved
|
||||
- **Resolved**: All alerts in the episode are resolved
|
||||
|
||||
This follows the pattern used by PagerDuty, Opsgenie, and other incident management platforms. The episode acts as an aggregate - it's only fully resolved when all underlying alerts are resolved.
|
||||
|
||||
For example, if I acknowledge an episode, all active alerts in that episode should also be acknowledged. This ensures consistency between episode and alert states.
|
||||
|
||||
**Q2: If a new alert joins an already-acknowledged episode, should the episode state change back to Active?**
|
||||
|
||||
**No** - the acknowledgment applies to the episode as a container, not to individual alerts. When a new alert joins an acknowledged episode:
|
||||
- The episode remains in "Acknowledged" state
|
||||
- The new alert is marked as part of an acknowledged episode
|
||||
- No new notification is sent for the episode (since it's already acknowledged)
|
||||
- The alert's own on-call policy still executes normally
|
||||
|
||||
This prevents notification fatigue while ensuring the operator knows the episode is still being worked on.
|
||||
|
||||
---
|
||||
|
||||
### Grouping Logic
|
||||
|
||||
**Q3: For Time Window grouping - if an alert comes in after the initial window, should it create a new episode or join the existing one?**
|
||||
|
||||
Use a **rolling/sliding window** approach:
|
||||
- The time window refers to the **gap between consecutive alerts**, not from the first alert
|
||||
- If an episode is still **Active** and a matching alert arrives, it joins the episode regardless of when the first alert occurred
|
||||
- The time window is used to determine when an episode becomes "inactive" (no new alerts for N minutes)
|
||||
- Example: With a 10-minute window, alerts at T+0, T+8, T+15, T+22 would all be in the same episode (each gap < 10 min)
|
||||
|
||||
This is the standard approach in tools like PagerDuty's Intelligent Grouping and Opsgenie's Alert Deduplication.
|
||||
|
||||
**Q4: What fields can be matched in Field-Based grouping?**
|
||||
|
||||
Standard matchable fields should include:
|
||||
| Field | Description |
|
||||
|-------|-------------|
|
||||
| `monitorId` | Same monitor/service |
|
||||
| `monitorCustomFields` | User-defined monitor metadata |
|
||||
| `alertSeverity` | Critical, Warning, Info, etc. |
|
||||
| `labels` | Key-value tags on alerts |
|
||||
| `alertTitle` | Exact or pattern match on title |
|
||||
| `alertDescription` | Pattern match on description |
|
||||
| `telemetryQuery` | The query that triggered the alert |
|
||||
|
||||
Rules should support both exact matching and regex patterns for string fields.
|
||||
|
||||
**Q5: If multiple AlertGroupingRules match a single alert, which rule takes priority?**
|
||||
|
||||
Use explicit **priority ordering**:
|
||||
- Each rule has a `priority` field (lower number = higher priority)
|
||||
- The **first matching rule** (highest priority) wins
|
||||
- Only one rule processes each alert
|
||||
- If no rules match, the alert remains ungrouped (standalone)
|
||||
|
||||
This gives operators explicit control over rule precedence, similar to firewall rules or routing tables.
|
||||
|
||||
---
|
||||
|
||||
### Scope & Implementation
|
||||
|
||||
**Q8: Should we implement backend only or both backend and frontend?**
|
||||
|
||||
Backend and Frontend. Please do not implement any database migrations. I will do that manually.
|
||||
|
||||
**Q9: What existing patterns in the codebase should we follow?**
|
||||
|
||||
Look at these existing features for patterns:
|
||||
- **Alert model and workflows** - Base patterns for state management
|
||||
- **Incident management** (if exists) - Similar grouping/aggregation concepts
|
||||
- **On-Call Policy execution** - Notification routing patterns
|
||||
- **Scheduled Jobs/Workers** - Pattern for background job implementation
|
||||
- **CRUD APIs** - Standard API patterns for the new models
|
||||
|
||||
---
|
||||
|
||||
### Worker Jobs
|
||||
|
||||
**Q10: What should the inactivity period be for EpisodeBreakInactive?**
|
||||
|
||||
Make it **configurable per rule** with sensible defaults:
|
||||
- **Default**: 60 minutes of inactivity
|
||||
- **Configurable range**: 5 minutes to 24 hours
|
||||
- **Per-rule setting**: `inactivityTimeoutMinutes` on `AlertGroupingRule`
|
||||
|
||||
The worker job should run every 1-5 minutes, checking for episodes that have exceeded their inactivity threshold.
|
||||
|
||||
| Scenario | Recommended Timeout |
|
||||
|----------|---------------------|
|
||||
| High-frequency alerts (metrics) | 5-15 minutes |
|
||||
| Standard monitoring | 30-60 minutes |
|
||||
| Low-frequency events | 2-4 hours |
|
||||
| Maintenance windows | 12-24 hours |
|
||||
|
||||
---
|
||||
|
||||
### Episode Title Generation
|
||||
|
||||
**Q11: When an episode is auto-created by a rule, how should the title be generated?**
|
||||
|
||||
**Recommendation**: Use a two-tier approach:
|
||||
1. **Default**: Use the first alert's title as the episode title
|
||||
2. **Optional override**: Allow a template on the `AlertGroupingRule` for custom naming
|
||||
|
||||
Template variables could include:
|
||||
- `{alertTitle}` - First alert's title
|
||||
- `{monitorName}` - Monitor/service name
|
||||
- `{alertSeverity}` - Severity level
|
||||
- `{alertCount}` - Number of alerts (updated dynamically)
|
||||
|
||||
Example templates:
|
||||
- `"{alertSeverity} issues on {monitorName}"` → "Critical issues on API Server"
|
||||
- `"{monitorName} - {alertTitle}"` → "Database - Connection timeout"
|
||||
|
||||
If no template is specified on the rule, default to the first alert's title.
|
||||
|
||||
---
|
||||
|
||||
### Manual Management
|
||||
|
||||
**Q12: If an alert is removed from an episode, what happens to it?**
|
||||
|
||||
The alert becomes **standalone** (ungrouped). The user can then optionally move it to a different episode manually. The alert's `episodeId` is set to null.
|
||||
|
||||
---
|
||||
|
||||
### State Synchronization
|
||||
|
||||
**Q13: If I manually resolve an episode, should all member alerts also be resolved?**
|
||||
|
||||
**Yes** - resolving an episode resolves all member alerts. This mirrors the acknowledge behavior: episode state changes cascade down to all member alerts for consistency.
|
||||
|
||||
State cascade rules:
|
||||
- **Acknowledge episode** → Acknowledge all member alerts
|
||||
- **Resolve episode** → Resolve all member alerts
|
||||
|
||||
---
|
||||
|
||||
### Grouping Combinations
|
||||
|
||||
**Q14: Can a single rule use BOTH Time Window AND Field-Based grouping together?**
|
||||
|
||||
**Yes** - rules can combine both grouping types. For example:
|
||||
- "Group alerts from the **same monitor** that occur **within 10 minutes** of each other"
|
||||
- "Group alerts with the **same severity and labels** within a **30-minute window**"
|
||||
|
||||
Both conditions must be satisfied for alerts to be grouped together.
|
||||
|
||||
---
|
||||
|
||||
### Alert Eligibility
|
||||
|
||||
**Q15: Can only Active alerts be grouped into episodes, or can alerts in any state be grouped?**
|
||||
|
||||
Alerts in **any state** can be grouped into episodes. This allows:
|
||||
- Grouping historical alerts for post-incident analysis
|
||||
- Manual organization of already-resolved alerts
|
||||
- Flexibility in episode management regardless of alert lifecycle stage
|
||||
|
||||
---
|
||||
|
||||
### Episode Ownership
|
||||
|
||||
**Q16: What ownership fields should AlertEpisode have?**
|
||||
|
||||
- **Assigned User** - Individual user responsible for the episode
|
||||
- **Assigned Team** - Team responsible for the episode
|
||||
|
||||
Both are optional and can be set manually or inherited from the grouping rule configuration.
|
||||
|
||||
---
|
||||
|
||||
### Episode Severity
|
||||
|
||||
**Q17: How should episode severity be determined?**
|
||||
|
||||
Use a **high-water mark with manual override** approach:
|
||||
|
||||
1. **Initial**: Set to first alert's severity
|
||||
2. **Auto-escalate**: When a new alert joins, if its severity > current episode severity → update episode to higher severity
|
||||
3. **Never auto-downgrade**: If lower severity alert joins → keep current episode severity
|
||||
4. **Manual override allowed**: User can edit severity at any time
|
||||
5. **Override respected until escalation**: If user sets to "Warning" but a "Critical" alert joins → escalate to "Critical"
|
||||
|
||||
This ensures users are always alerted to the worst-case scenario while respecting manual judgment when appropriate.
|
||||
|
||||
---
|
||||
|
||||
### Root Cause Field
|
||||
|
||||
**Q18: What is the structure of the root cause field?**
|
||||
|
||||
Simple **text field** on AlertEpisode. Users can document their root cause analysis as free-form text. Future enhancement: AI-assisted summarization based on alert data.
|
||||
|
||||
---
|
||||
|
||||
### Flapping Prevention Configuration
|
||||
|
||||
**Q19: Where are flapping prevention settings configured?**
|
||||
|
||||
**Per-rule** on AlertGroupingRule:
|
||||
- `resolveDelayMinutes` - Grace period before auto-resolving
|
||||
- `reopenWindowMinutes` - Window after resolution where episode can be reopened
|
||||
|
||||
Each rule can have different flapping prevention settings based on the type of alerts it handles.
|
||||
|
||||
---
|
||||
|
||||
### Manual Episode Creation
|
||||
|
||||
**Q20: Can users create episodes manually without a grouping rule?**
|
||||
|
||||
**Yes** - users can manually create episodes and add alerts to them. This allows:
|
||||
- Ad-hoc grouping for incidents that don't match existing rules
|
||||
- Post-incident organization of related alerts
|
||||
- Flexibility for edge cases not covered by automated rules
|
||||
|
||||
---
|
||||
|
||||
### Episode Deletion
|
||||
|
||||
**Q21: Can episodes be deleted? What happens to member alerts?**
|
||||
|
||||
**Yes** - episodes can be deleted, but alerts must be **removed first** to make them standalone. This is a safety measure to prevent accidental data loss. The deletion flow:
|
||||
1. User removes all alerts from episode (alerts become standalone)
|
||||
2. User can then delete the empty episode
|
||||
|
||||
As a fallback, if an episode is deleted while alerts are still in it, those alerts are not deleted - they become standalone automatically.
|
||||
|
||||
---
|
||||
|
||||
### UI Location
|
||||
|
||||
**Q22: Where should Episodes appear in the navigation?**
|
||||
|
||||
New **sidemenu item** in the Alerts section. Episodes should have their own dedicated list page accessible from the main navigation, similar to how Alerts have their own page.
|
||||
|
||||
---
|
||||
|
||||
### Alert-to-Episode Relationship
|
||||
|
||||
**Q23: Can an alert belong to multiple episodes?**
|
||||
|
||||
**No** - an alert can only belong to **one episode at a time** (single ownership). This provides:
|
||||
- Simpler mental model for users
|
||||
- Clear state cascade without conflicts
|
||||
- Industry-standard approach (PagerDuty, Opsgenie)
|
||||
- Cleaner queries and data management
|
||||
|
||||
The `episodeId` field on Alert is singular and nullable.
|
||||
774
Docs/Plan/AlertStormDetection/1-DataModels.md
Normal file
774
Docs/Plan/AlertStormDetection/1-DataModels.md
Normal file
@@ -0,0 +1,774 @@
|
||||
# Data Models for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the database models required for Alert Storm Detection and Noise Reduction Analytics functionality.
|
||||
|
||||
## Entity Relationship Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────┐
|
||||
│ AlertStormEvent │
|
||||
├─────────────────────────┤
|
||||
│ id │
|
||||
│ projectId │
|
||||
│ status (active/resolved)│
|
||||
│ startedAt │
|
||||
│ endedAt │
|
||||
│ peakAlertRate │
|
||||
│ normalAlertRate │
|
||||
│ multiplier │
|
||||
│ affectedMonitors (JSON) │
|
||||
│ totalAlertsInStorm │
|
||||
└─────────────────────────┘
|
||||
|
||||
┌─────────────────────────┐
|
||||
│ NoiseReductionMetric │
|
||||
├─────────────────────────┤
|
||||
│ id │
|
||||
│ projectId │
|
||||
│ date │
|
||||
│ totalAlertTriggers │
|
||||
│ deduplicated │
|
||||
│ suppressed │
|
||||
│ alertsGrouped │
|
||||
│ notificationsSent │
|
||||
│ noiseReductionPercent │
|
||||
└─────────────────────────┘
|
||||
|
||||
┌─────────────────────────┐
|
||||
│ AlertVolumeSnapshot │
|
||||
├─────────────────────────┤
|
||||
│ id │
|
||||
│ projectId │
|
||||
│ timestamp │
|
||||
│ alertCount │
|
||||
│ intervalMinutes │
|
||||
└─────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Model Definitions
|
||||
|
||||
### 1. AlertStormEvent
|
||||
|
||||
Records storm events for tracking and analysis.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/AlertStormEvent.ts`
|
||||
|
||||
```typescript
|
||||
import {
|
||||
Column,
|
||||
Entity,
|
||||
Index,
|
||||
JoinColumn,
|
||||
ManyToOne,
|
||||
} from 'typeorm';
|
||||
import BaseModel from './DatabaseBaseModel/DatabaseBaseModel';
|
||||
import Project from './Project';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import ColumnType from 'Common/Types/Database/ColumnType';
|
||||
import TableColumnType from 'Common/Types/Database/TableColumnType';
|
||||
import Permission from 'Common/Types/Permission';
|
||||
import IconProp from 'Common/Types/Icon/IconProp';
|
||||
|
||||
/**
 * Lifecycle status of an alert storm event.
 *
 * The string values are what is stored in the `status` column of
 * `AlertStormEvent`.
 */
export enum StormStatus {
  /** The storm is currently ongoing. */
  Active = 'active',
  /** The storm has ended. */
  Resolved = 'resolved',
}
|
||||
|
||||
/**
 * Severity band of an alert storm, expressed as a multiple of the normal
 * (baseline) alert rate.
 *
 * NOTE(review): the documented bands "2x - 3x" and "3x - 5x" both include
 * exactly 3x; which band a 3x storm falls into is not specified here and
 * should be decided by whoever implements the classifier.
 */
export enum StormSeverity {
  /** 2x - 3x normal. */
  Elevated = 'elevated',
  /** 3x - 5x normal. */
  Storm = 'storm',
  /** > 5x normal. */
  Critical = 'critical',
}
|
||||
|
||||
/**
 * One entry of the `affectedMonitors` JSON column on `AlertStormEvent`:
 * a monitor that contributed alerts to a storm.
 */
export interface AffectedMonitor {
  /** Identifier of the monitor, as a string. */
  monitorId: string;
  /** Name of the monitor. */
  monitorName: string;
  /** Number of alerts attributed to this monitor. */
  alertCount: number;
}
|
||||
|
||||
/**
 * Database model that records one alert storm event for tracking and
 * analysis.
 *
 * Table-level access control declares empty `create` and `update` permission
 * lists, `read` for project owners, admins and members, and `delete` for
 * project owners and admins. Every column repeats the same empty
 * `create`/`update` lists.
 * NOTE(review): presumably the empty lists mean rows are only written by
 * backend services rather than through the CRUD API - confirm against the
 * access-control decorators.
 *
 * NOTE(review): `EnableDocumentation`, `TableAccessControl`,
 * `CrudApiEndpoint`, `Route`, `TableMetadata`, `ColumnAccessControl` and
 * `TableColumn` are used below but do not appear in this sample's import
 * list; they need to be imported before this file can compile.
 */
@EnableDocumentation()
@TableAccessControl({
  create: [],
  read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
  update: [],
  delete: [Permission.ProjectOwner, Permission.ProjectAdmin],
})
@CrudApiEndpoint(new Route('/alert-storm-event'))
@TableMetadata({
  tableName: 'AlertStormEvent',
  singularName: 'Storm Event',
  pluralName: 'Storm Events',
  icon: IconProp.Alert,
  tableDescription: 'Records of alert storm events',
})
@Entity({
  name: 'AlertStormEvent',
})
export default class AlertStormEvent extends BaseModel {
  // ─────────────────────────────────────────────────────────────
  // PROJECT
  // ─────────────────────────────────────────────────────────────

  /**
   * Owning project relation, joined on `projectId`. The relation is declared
   * with `onDelete: 'CASCADE'`.
   */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Entity,
    modelType: Project,
    title: 'Project',
  })
  @ManyToOne(() => Project, {
    onDelete: 'CASCADE',
    orphanedRowAction: 'delete',
  })
  @JoinColumn({ name: 'projectId' })
  public project?: Project = undefined;

  /** ID of the owning project. Required and indexed. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.ObjectID,
    title: 'Project ID',
  })
  @Column({
    type: ColumnType.ObjectID,
    nullable: false,
  })
  @Index()
  public projectId?: ObjectID = undefined;

  // ─────────────────────────────────────────────────────────────
  // STATUS
  // ─────────────────────────────────────────────────────────────

  /** Current status of the storm. Required and indexed. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.ShortText,
    title: 'Status',
    description: 'Current status of the storm',
  })
  @Column({
    type: ColumnType.ShortText,
    length: 20,
    nullable: false,
  })
  @Index()
  public status?: StormStatus = undefined;

  /** Severity level of the storm. Required; not indexed. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.ShortText,
    title: 'Severity',
    description: 'Severity level of the storm',
  })
  @Column({
    type: ColumnType.ShortText,
    length: 20,
    nullable: false,
  })
  public severity?: StormSeverity = undefined;

  // ─────────────────────────────────────────────────────────────
  // TIMING
  // ─────────────────────────────────────────────────────────────

  /** When the storm was first detected. Required and indexed. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Started At',
    description: 'When the storm was first detected',
  })
  @Column({
    type: ColumnType.Date,
    nullable: false,
  })
  @Index()
  public startedAt?: Date = undefined;

  /** When the storm ended. Nullable. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Ended At',
    description: 'When the storm ended',
  })
  @Column({
    type: ColumnType.Date,
    nullable: true,
  })
  public endedAt?: Date = undefined;

  /** Total duration of the storm in minutes. Nullable. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Duration Minutes',
    description: 'Total duration of the storm in minutes',
  })
  @Column({
    type: ColumnType.Number,
    nullable: true,
  })
  public durationMinutes?: number = undefined;

  // ─────────────────────────────────────────────────────────────
  // METRICS
  // ─────────────────────────────────────────────────────────────

  /** Peak alerts per hour during the storm. Required. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Peak Alert Rate',
    description: 'Peak alerts per hour during storm',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
  })
  public peakAlertRate?: number = undefined;

  /** Normal (baseline) alerts per hour. Required. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Normal Alert Rate',
    description: 'Normal alerts per hour (baseline)',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
  })
  public normalAlertRate?: number = undefined;

  /**
   * How many times the normal rate the peak rate was. Required.
   *
   * NOTE(review): with precision 5 and scale 2 a SQL decimal column tops out
   * at 999.99; a storm against a very low baseline could exceed that -
   * confirm this is enough headroom or clamp the value before saving.
   */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Multiplier',
    description: 'How many times normal the peak rate was',
  })
  @Column({
    type: ColumnType.Decimal,
    precision: 5,
    scale: 2,
    nullable: false,
  })
  public multiplier?: number = undefined;

  /** Total alerts during the storm. Required; column default is 0. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Total Alerts',
    description: 'Total alerts during the storm',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
    default: 0,
  })
  public totalAlertsInStorm?: number = undefined;

  // ─────────────────────────────────────────────────────────────
  // AFFECTED MONITORS
  // ─────────────────────────────────────────────────────────────

  /** Top monitors contributing to the storm, stored as JSON. Nullable. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.JSON,
    title: 'Affected Monitors',
    description: 'Top monitors contributing to the storm',
  })
  @Column({
    type: ColumnType.JSON,
    nullable: true,
  })
  public affectedMonitors?: Array<AffectedMonitor> = undefined;

  // ─────────────────────────────────────────────────────────────
  // SUPPRESSION
  // ─────────────────────────────────────────────────────────────

  /**
   * Whether emergency suppression was activated. Required; column default is
   * false.
   */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Boolean,
    title: 'Emergency Suppression Active',
    description: 'Whether emergency suppression was activated',
  })
  @Column({
    type: ColumnType.Boolean,
    nullable: false,
    default: false,
  })
  public emergencySuppressionActive?: boolean = undefined;

  /**
   * Alerts suppressed during the storm (if emergency suppression was active).
   * Required; column default is 0.
   */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Suppressed During Storm',
    description: 'Alerts suppressed during storm (if emergency suppression active)',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
    default: 0,
  })
  public suppressedDuringStorm?: number = undefined;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. NoiseReductionMetric
|
||||
|
||||
Daily metrics for noise reduction analytics.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/NoiseReductionMetric.ts`
|
||||
|
||||
```typescript
|
||||
@TableMetadata({
|
||||
tableName: 'NoiseReductionMetric',
|
||||
singularName: 'Noise Reduction Metric',
|
||||
pluralName: 'Noise Reduction Metrics',
|
||||
icon: IconProp.ChartBar,
|
||||
tableDescription: 'Daily noise reduction statistics',
|
||||
})
|
||||
@Entity({
|
||||
name: 'NoiseReductionMetric',
|
||||
})
|
||||
export default class NoiseReductionMetric extends BaseModel {
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// PROJECT
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ObjectID,
|
||||
title: 'Project ID',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ObjectID,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public projectId?: ObjectID = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// DATE
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Date,
|
||||
title: 'Date',
|
||||
description: 'Date for these metrics',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Date,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public date?: Date = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// ALERT COUNTS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Total Alert Triggers',
|
||||
description: 'Total number of alert triggers (before noise reduction)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public totalAlertTriggers?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Alerts Created',
|
||||
description: 'Actual alerts created (after deduplication/suppression)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public alertsCreated?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// DEDUPLICATION METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Deduplicated',
|
||||
description: 'Alerts prevented by deduplication',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public deduplicated?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// SUPPRESSION METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Suppressed',
|
||||
description: 'Alerts prevented by suppression rules',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public suppressed?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Suppressed by Maintenance',
|
||||
description: 'Alerts suppressed by maintenance windows',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public suppressedByMaintenance?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Suppressed by Rate Limit',
|
||||
description: 'Alerts suppressed by rate limits',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public suppressedByRateLimit?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// GROUPING METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Alerts Grouped',
|
||||
description: 'Alerts grouped into episodes',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public alertsGrouped?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Episodes Created',
|
||||
description: 'Number of episodes created',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public episodesCreated?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// NOTIFICATION METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Notifications Sent',
|
||||
description: 'Actual notifications sent (after all filtering)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public notificationsSent?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Notifications Suppressed',
|
||||
description: 'Notifications that were suppressed',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public notificationsSuppressed?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// CALCULATED METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Noise Reduction Percent',
|
||||
description: 'Overall noise reduction percentage',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Decimal,
|
||||
precision: 5,
|
||||
scale: 2,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public noiseReductionPercent?: number = undefined;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. AlertVolumeSnapshot
|
||||
|
||||
Periodic snapshots of alert volume for trend analysis.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/AlertVolumeSnapshot.ts`
|
||||
|
||||
```typescript
|
||||
/**
 * Periodic snapshot of a project's alert volume, used for trend analysis.
 *
 * Each row records how many alerts were counted for one project in one
 * interval of `intervalMinutes` minutes, stamped with `timestamp`.
 *
 * Every column declares empty `create` / `update` permission lists and is
 * readable by project owners and admins only.
 * NOTE(review): rows are presumably written by root-level server code — confirm.
 */
@TableMetadata({
    tableName: 'AlertVolumeSnapshot',
    singularName: 'Volume Snapshot',
    pluralName: 'Volume Snapshots',
    icon: IconProp.ChartLine,
    tableDescription: 'Periodic alert volume snapshots',
})
@Entity({
    name: 'AlertVolumeSnapshot',
})
export default class AlertVolumeSnapshot extends BaseModel {
    /** Project this snapshot belongs to. Indexed for per-project lookups. */
    @ColumnAccessControl({
        create: [],
        read: [Permission.ProjectOwner, Permission.ProjectAdmin],
        update: [],
    })
    @TableColumn({
        type: TableColumnType.ObjectID,
        title: 'Project ID',
    })
    @Column({
        type: ColumnType.ObjectID,
        nullable: false,
        // Convert between the ObjectID wrapper and its stored representation.
        transformer: ObjectID.getDatabaseTransformer(),
    })
    @Index()
    public projectId?: ObjectID = undefined;

    /** When this snapshot was taken. Indexed for time-range queries. */
    @ColumnAccessControl({
        create: [],
        read: [Permission.ProjectOwner, Permission.ProjectAdmin],
        update: [],
    })
    @TableColumn({
        type: TableColumnType.Date,
        title: 'Timestamp',
        description: 'When this snapshot was taken',
    })
    @Column({
        type: ColumnType.Date,
        nullable: false,
    })
    @Index()
    public timestamp?: Date = undefined;

    /** Number of alerts counted in this interval. */
    @ColumnAccessControl({
        create: [],
        read: [Permission.ProjectOwner, Permission.ProjectAdmin],
        update: [],
    })
    @TableColumn({
        type: TableColumnType.Number,
        title: 'Alert Count',
        description: 'Number of alerts in this interval',
    })
    @Column({
        type: ColumnType.Number,
        nullable: false,
    })
    public alertCount?: number = undefined;

    /** Width of the sampling interval in minutes; the column defaults to 5. */
    @ColumnAccessControl({
        create: [],
        read: [Permission.ProjectOwner, Permission.ProjectAdmin],
        update: [],
    })
    @TableColumn({
        type: TableColumnType.Number,
        title: 'Interval Minutes',
        description: 'Interval size in minutes (e.g., 5, 15, 60)',
    })
    @Column({
        type: ColumnType.Number,
        nullable: false,
        default: 5,
    })
    public intervalMinutes?: number = undefined;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Indexes
|
||||
|
||||
```sql
|
||||
-- AlertStormEvent indexes
-- Listing / filtering storm events per project and status, newest first.
CREATE INDEX idx_storm_event_project_status
    ON "AlertStormEvent" ("projectId", "status", "startedAt" DESC);

-- Fast lookup of the currently active storm for a project.
-- The partial-index predicate already pins "status", so only "projectId"
-- needs to be a key column.
CREATE INDEX idx_storm_event_active
    ON "AlertStormEvent" ("projectId")
    WHERE "status" = 'active';

-- NoiseReductionMetric indexes
-- One metric row per project per day. This unique index also serves
-- "projectId = ? ORDER BY date DESC" queries (a b-tree can be scanned
-- backwards), so a separate non-unique ("projectId", "date" DESC) index
-- would only add write overhead and is intentionally not created.
CREATE UNIQUE INDEX idx_noise_metric_unique
    ON "NoiseReductionMetric" ("projectId", "date");

-- AlertVolumeSnapshot indexes
-- Per-project time-series reads, newest first.
CREATE INDEX idx_volume_snapshot_project_time
    ON "AlertVolumeSnapshot" ("projectId", "timestamp" DESC);

-- Partition by time for efficient cleanup
-- Consider partitioning AlertVolumeSnapshot by month
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
- [ ] Create AlertStormEvent model
|
||||
- [ ] Create NoiseReductionMetric model
|
||||
- [ ] Create AlertVolumeSnapshot model
|
||||
- [ ] Register models in model registry
|
||||
- [ ] Create database migrations
|
||||
- [ ] Add indexes
|
||||
- [ ] Update API permissions
|
||||
630
Docs/Plan/AlertStormDetection/2-Backend.md
Normal file
630
Docs/Plan/AlertStormDetection/2-Backend.md
Normal file
@@ -0,0 +1,630 @@
|
||||
# Backend Implementation for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the backend services and components required for Alert Storm Detection and Noise Reduction Analytics.
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. StormDetector
|
||||
|
||||
Main service for detecting alert storms.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/StormDetector.ts`
|
||||
|
||||
```typescript
|
||||
import AlertService from '../../Services/AlertService';
import AlertStormEventService from '../../Services/AlertStormEventService';
import MonitorService from '../../Services/MonitorService';
import NotificationService from '../../Services/NotificationService';
import AlertStormEvent, {
    // Aliased: this module exports its own `StormStatus` interface below, and
    // an import may not share a name with a local declaration.
    StormStatus as StormEventStatus,
    StormSeverity,
    AffectedMonitor,
} from '../../Models/DatabaseModels/AlertStormEvent';
import ObjectID from 'Common/Types/ObjectID';
import OneUptimeDate from 'Common/Types/Date';
import QueryHelper from '../../Types/Database/QueryHelper';
import logger from '../../Utils/Logger';

/**
 * Result of a storm check for one project (see `StormDetector.checkStatus`).
 */
export interface StormStatus {
    /** True when the multiplier reaches the storm threshold and the rate is above the configured minimum. */
    isStorm: boolean;
    /** Severity classification, or null when the rate is unremarkable. */
    severity: StormSeverity | null;
    /** Alerts created in the last hour. */
    currentRate: number;
    /** Average alerts per hour over the baseline window, rounded to 2 decimals. */
    normalRate: number;
    /** `currentRate / normalRate`, rounded to 2 decimals. */
    multiplier: number;
    /** Top alerting monitors; only populated when `isStorm` is true. */
    affectedMonitors?: Array<AffectedMonitor>;
    /** The project's storm event that is currently in the active state, if any. */
    activeStormEvent?: AlertStormEvent;
}

export interface StormConfig {
    // Multiplier threshold for storm detection
    stormThreshold: number; // Default: 3

    // Multiplier threshold for critical storm
    criticalThreshold: number; // Default: 5

    // Minimum alerts per hour to consider for storm
    minimumAlertRate: number; // Default: 10

    // Historical lookback hours for baseline
    baselineHours: number; // Default: 24

    // Enable emergency suppression
    enableEmergencySuppression: boolean; // Default: false
}

export const DEFAULT_STORM_CONFIG: StormConfig = {
    stormThreshold: 3,
    criticalThreshold: 5,
    minimumAlertRate: 10,
    baselineHours: 24,
    enableEmergencySuppression: false,
};

export default class StormDetector {
    /**
     * Check current storm status for a project.
     *
     * Compares the number of alerts created in the last hour against the
     * hourly average of the preceding baseline window.
     *
     * @param projectId - Project to evaluate.
     * @param config - Optional overrides merged over `DEFAULT_STORM_CONFIG`.
     * @returns The computed rates, classification and any active storm event.
     */
    public static async checkStatus(
        projectId: ObjectID,
        config?: Partial<StormConfig>
    ): Promise<StormStatus> {
        const mergedConfig = { ...DEFAULT_STORM_CONFIG, ...config };

        const now = new Date();
        const oneHourAgo = OneUptimeDate.addRemoveHours(now, -1);
        const baselineStart = OneUptimeDate.addRemoveHours(
            now,
            -mergedConfig.baselineHours
        );

        // Get current hour's alert count
        const currentCount = await AlertService.countBy({
            query: {
                projectId,
                createdAt: QueryHelper.greaterThan(oneHourAgo),
            },
            props: { isRoot: true },
        });

        // Get historical average (excluding current hour)
        const historicalCount = await AlertService.countBy({
            query: {
                projectId,
                createdAt: QueryHelper.between(baselineStart, oneHourAgo),
            },
            props: { isRoot: true },
        });

        // The baseline window excludes the most recent hour, hence the -1.
        const hoursInBaseline = mergedConfig.baselineHours - 1;
        const normalRate =
            hoursInBaseline > 0
                ? historicalCount / hoursInBaseline
                : mergedConfig.minimumAlertRate;

        const currentRate = currentCount;
        // With no history at all, fall back to the raw count as the multiplier.
        const multiplier =
            normalRate > 0 ? currentRate / normalRate : currentRate;

        // Determine storm status
        let isStorm = false;
        let severity: StormSeverity | null = null;

        if (multiplier >= mergedConfig.criticalThreshold) {
            isStorm = true;
            severity = StormSeverity.Critical;
        } else if (multiplier >= mergedConfig.stormThreshold) {
            isStorm = true;
            severity = StormSeverity.Storm;
        } else if (multiplier >= 2) {
            // Elevated is reported as a severity but is not treated as a storm.
            severity = StormSeverity.Elevated;
        }

        // Only consider it a storm if rate is above minimum
        if (currentRate < mergedConfig.minimumAlertRate) {
            isStorm = false;
            severity = null;
        }

        // Get affected monitors if storm
        let affectedMonitors: Array<AffectedMonitor> | undefined;
        if (isStorm) {
            affectedMonitors = await this.getTopAlertingMonitors(
                projectId,
                oneHourAgo
            );
        }

        // Check for active storm event
        const activeStormEvent = await AlertStormEventService.findOneBy({
            query: {
                projectId,
                status: StormEventStatus.Active,
            },
            select: {
                _id: true,
                startedAt: true,
                peakAlertRate: true,
            },
            props: { isRoot: true },
        });

        return {
            isStorm,
            severity,
            currentRate,
            normalRate: Math.round(normalRate * 100) / 100,
            multiplier: Math.round(multiplier * 100) / 100,
            affectedMonitors,
            activeStormEvent: activeStormEvent || undefined,
        };
    }

    /**
     * Get the (up to 10) monitors with the most alerts since `since`.
     *
     * @param projectId - Project to inspect.
     * @param since - Lower bound on alert creation time.
     * @returns Monitors ordered by descending alert count.
     */
    private static async getTopAlertingMonitors(
        projectId: ObjectID,
        since: Date
    ): Promise<Array<AffectedMonitor>> {
        const result = await AlertService.aggregate({
            pipeline: [
                {
                    $match: {
                        projectId: projectId.toString(),
                        createdAt: { $gte: since },
                        monitorId: { $ne: null },
                    },
                },
                {
                    $group: {
                        _id: '$monitorId',
                        count: { $sum: 1 },
                    },
                },
                { $sort: { count: -1 } },
                { $limit: 10 },
            ],
        });

        // Nothing alerted from a monitor: skip the name lookup so we never
        // issue a query with an empty ID list.
        if (result.length === 0) {
            return [];
        }

        // Get monitor names
        const monitorIds = result.map((r) => new ObjectID(r._id));
        const monitors = await MonitorService.findBy({
            query: {
                _id: QueryHelper.any(monitorIds),
            },
            select: { _id: true, name: true },
            props: { isRoot: true },
        });

        const monitorMap = new Map(
            monitors.map((m) => [m.id?.toString(), m.name])
        );

        return result.map((r) => ({
            monitorId: r._id,
            monitorName: monitorMap.get(r._id) || 'Unknown',
            alertCount: r.count,
        }));
    }

    /**
     * Process storm detection and create/update/resolve storm events.
     *
     * @param projectId - Project to evaluate.
     * @param config - Optional overrides merged over `DEFAULT_STORM_CONFIG`.
     */
    public static async processStormDetection(
        projectId: ObjectID,
        config?: Partial<StormConfig>
    ): Promise<void> {
        const status = await this.checkStatus(projectId, config);

        if (status.isStorm && !status.activeStormEvent) {
            // New storm detected - create event
            await this.createStormEvent(projectId, status);
        } else if (status.isStorm && status.activeStormEvent) {
            // Storm ongoing - update event
            await this.updateStormEvent(status.activeStormEvent.id!, status);
        } else if (!status.isStorm && status.activeStormEvent) {
            // Storm ended - resolve event
            await this.resolveStormEvent(status.activeStormEvent.id!);
        }
    }

    /**
     * Create a new storm event and send the storm-start notification.
     *
     * @param projectId - Project the storm belongs to.
     * @param status - Status that triggered the storm (`isStorm` is true).
     * @returns The created event.
     */
    private static async createStormEvent(
        projectId: ObjectID,
        status: StormStatus
    ): Promise<AlertStormEvent> {
        const event = await AlertStormEventService.create({
            data: {
                projectId,
                status: StormEventStatus.Active,
                // Non-null here: callers only invoke this when isStorm is true,
                // which checkStatus only sets together with a severity.
                severity: status.severity!,
                startedAt: new Date(),
                peakAlertRate: status.currentRate,
                normalAlertRate: status.normalRate,
                multiplier: status.multiplier,
                affectedMonitors: status.affectedMonitors,
                totalAlertsInStorm: status.currentRate,
            } as AlertStormEvent,
            props: { isRoot: true },
        });

        // Send notifications
        await NotificationService.sendStormStartNotification({
            projectId,
            stormEvent: event,
        });

        logger.info(`Storm detected for project ${projectId}: ${status.multiplier}x normal`);

        return event;
    }

    /**
     * Update an ongoing storm event with the latest peak values.
     *
     * `totalAlertsInStorm` is recounted over the storm window rather than
     * incremented: `status.currentRate` is a rolling one-hour count, so adding
     * it on every run would count the same alerts many times.
     *
     * @param eventId - ID of the active storm event.
     * @param status - Latest status from `checkStatus`.
     */
    private static async updateStormEvent(
        eventId: ObjectID,
        status: StormStatus
    ): Promise<void> {
        const event = await AlertStormEventService.findOneById({
            id: eventId,
            select: {
                projectId: true,
                startedAt: true,
                peakAlertRate: true,
                // Must be selected, otherwise the stored peak multiplier is
                // read as undefined and overwritten on every update.
                multiplier: true,
                totalAlertsInStorm: true,
            },
            props: { isRoot: true },
        });

        if (!event) {
            return;
        }

        let totalAlertsInStorm = event.totalAlertsInStorm || 0;

        if (event.projectId && event.startedAt) {
            // The value stored at creation covers the hour before detection,
            // so recount from one hour before the recorded start.
            const countedFrom = OneUptimeDate.addRemoveHours(
                event.startedAt,
                -1
            );
            const recount = await AlertService.countBy({
                query: {
                    projectId: event.projectId,
                    createdAt: QueryHelper.greaterThan(countedFrom),
                },
                props: { isRoot: true },
            });
            // Never let the total go backwards.
            totalAlertsInStorm = Math.max(totalAlertsInStorm, recount);
        }

        await AlertStormEventService.updateOneById({
            id: eventId,
            data: {
                peakAlertRate: Math.max(
                    event.peakAlertRate || 0,
                    status.currentRate
                ),
                multiplier: Math.max(event.multiplier || 0, status.multiplier),
                totalAlertsInStorm,
                affectedMonitors: status.affectedMonitors,
            },
            props: { isRoot: true },
        });
    }

    /**
     * Resolve a storm event and send the storm-end notification.
     *
     * @param eventId - ID of the storm event to resolve.
     */
    private static async resolveStormEvent(eventId: ObjectID): Promise<void> {
        const event = await AlertStormEventService.findOneById({
            id: eventId,
            select: { startedAt: true, projectId: true },
            props: { isRoot: true },
        });

        if (!event) {
            return;
        }

        const now = new Date();
        // Fall back to a zero-length storm if the start time is missing
        // instead of throwing on a null dereference.
        const startedAt = event.startedAt ?? now;
        const durationMinutes = Math.round(
            (now.getTime() - startedAt.getTime()) / 60000
        );

        await AlertStormEventService.updateOneById({
            id: eventId,
            data: {
                status: StormEventStatus.Resolved,
                endedAt: now,
                durationMinutes,
            },
            props: { isRoot: true },
        });

        // Send notification
        await NotificationService.sendStormEndNotification({
            projectId: event.projectId!,
            stormEventId: eventId,
            durationMinutes,
        });

        logger.info(`Storm resolved for project ${event.projectId} after ${durationMinutes} minutes`);
    }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. NoiseReductionAnalytics
|
||||
|
||||
Service for calculating and retrieving noise reduction metrics.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/NoiseReductionAnalytics.ts`
|
||||
|
||||
```typescript
|
||||
import NoiseReductionMetric from '../../Models/DatabaseModels/NoiseReductionMetric';
|
||||
import NoiseReductionMetricService from '../../Services/NoiseReductionMetricService';
|
||||
import AlertService from '../../Services/AlertService';
|
||||
import SuppressedAlertLogService from '../../Services/SuppressedAlertLogService';
|
||||
import AlertFingerprintService from '../../Services/AlertFingerprintService';
|
||||
import AlertEpisodeService from '../../Services/AlertEpisodeService';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import OneUptimeDate from 'Common/Types/Date';
|
||||
import QueryHelper from '../../Types/Database/QueryHelper';
|
||||
|
||||
/**
 * Aggregated noise reduction figures for a date range
 * (see `NoiseReductionAnalytics.getSummary`).
 */
export interface NoiseReductionSummary {
    /** The requested date range, echoed back. */
    period: {
        startDate: Date;
        endDate: Date;
    };
    /** alertsCreated + deduplicated + suppressed. */
    totalAlertTriggers: number;
    alertsCreated: number;
    deduplicated: number;
    suppressed: number;
    /** Sum of the daily `alertsGrouped` values. */
    grouped: number;
    notificationsSent: number;
    /** (deduplicated + suppressed) / totalAlertTriggers, as a percentage with 2 decimals. */
    noiseReductionPercent: number;
}

export default class NoiseReductionAnalytics {
    /**
     * Share of triggers that were deduplicated or suppressed, as a
     * percentage rounded to 2 decimals. Returns 0 when there were no triggers.
     */
    private static calculateNoiseReductionPercent(
        reduced: number,
        totalTriggers: number
    ): number {
        if (totalTriggers <= 0) {
            return 0;
        }

        const percent = (reduced / totalTriggers) * 100;
        return Math.round(percent * 100) / 100;
    }

    /**
     * Calculate daily noise reduction metrics for a project and upsert the
     * metric row for that day.
     *
     * @param projectId - Project to calculate metrics for.
     * @param date - Any instant within the day to calculate.
     * @returns The created or updated metric.
     */
    public static async calculateDailyMetrics(
        projectId: ObjectID,
        date: Date
    ): Promise<NoiseReductionMetric> {
        const startOfDay = OneUptimeDate.getStartOfDay(date);
        const endOfDay = OneUptimeDate.getEndOfDay(date);

        // The five source queries are independent, so run them concurrently.
        const [
            alertsCreated,
            fingerprints,
            suppressed,
            alertsGrouped,
            episodesCreated,
        ] = await Promise.all([
            // Count alerts created
            AlertService.countBy({
                query: {
                    projectId,
                    createdAt: QueryHelper.between(startOfDay, endOfDay),
                },
                props: { isRoot: true },
            }),
            // Fingerprints whose window started today (for the deduplicated count)
            AlertFingerprintService.findBy({
                query: {
                    projectId,
                    windowStartAt: QueryHelper.between(startOfDay, endOfDay),
                },
                select: { duplicateCount: true },
                props: { isRoot: true },
            }),
            // Count suppressed
            SuppressedAlertLogService.countBy({
                query: {
                    projectId,
                    suppressedAt: QueryHelper.between(startOfDay, endOfDay),
                },
                props: { isRoot: true },
            }),
            // Count grouped alerts
            AlertService.countBy({
                query: {
                    projectId,
                    createdAt: QueryHelper.between(startOfDay, endOfDay),
                    episodeId: QueryHelper.notNull(),
                },
                props: { isRoot: true },
            }),
            // Count episodes created
            AlertEpisodeService.countBy({
                query: {
                    projectId,
                    startedAt: QueryHelper.between(startOfDay, endOfDay),
                },
                props: { isRoot: true },
            }),
        ]);

        // Count deduplicated
        const deduplicated = fingerprints.reduce(
            (sum, fp) => sum + (fp.duplicateCount || 0),
            0
        );

        // Calculate totals
        const totalAlertTriggers = alertsCreated + deduplicated + suppressed;
        const noiseReductionPercent = this.calculateNoiseReductionPercent(
            deduplicated + suppressed,
            totalAlertTriggers
        );

        // Create or update metric
        const existingMetric = await NoiseReductionMetricService.findOneBy({
            query: {
                projectId,
                date: startOfDay,
            },
            select: { _id: true },
            props: { isRoot: true },
        });

        const metricData: Partial<NoiseReductionMetric> = {
            projectId,
            date: startOfDay,
            totalAlertTriggers,
            alertsCreated,
            deduplicated,
            suppressed,
            alertsGrouped,
            episodesCreated,
            noiseReductionPercent,
        };

        if (existingMetric) {
            await NoiseReductionMetricService.updateOneById({
                id: existingMetric.id!,
                data: metricData,
                props: { isRoot: true },
            });

            // Copy the new values onto the fetched model so the caller gets a
            // real NoiseReductionMetric instance, not a plain object cast to one.
            return Object.assign(existingMetric, metricData);
        }

        return await NoiseReductionMetricService.create({
            data: metricData as NoiseReductionMetric,
            props: { isRoot: true },
        });
    }

    /**
     * Get noise reduction summary for a date range by summing the stored
     * daily metrics.
     *
     * @param projectId - Project to summarise.
     * @param startDate - Start of the period.
     * @param endDate - End of the period.
     * @returns Totals for the period and the overall reduction percentage.
     */
    public static async getSummary(
        projectId: ObjectID,
        startDate: Date,
        endDate: Date
    ): Promise<NoiseReductionSummary> {
        const metrics = await NoiseReductionMetricService.findBy({
            query: {
                projectId,
                date: QueryHelper.between(startDate, endDate),
            },
            select: {
                totalAlertTriggers: true,
                alertsCreated: true,
                deduplicated: true,
                suppressed: true,
                alertsGrouped: true,
                notificationsSent: true,
            },
            props: { isRoot: true },
        });

        const totals = metrics.reduce(
            (acc, m) => ({
                totalAlertTriggers:
                    acc.totalAlertTriggers + (m.totalAlertTriggers || 0),
                alertsCreated: acc.alertsCreated + (m.alertsCreated || 0),
                deduplicated: acc.deduplicated + (m.deduplicated || 0),
                suppressed: acc.suppressed + (m.suppressed || 0),
                grouped: acc.grouped + (m.alertsGrouped || 0),
                notificationsSent:
                    acc.notificationsSent + (m.notificationsSent || 0),
            }),
            {
                totalAlertTriggers: 0,
                alertsCreated: 0,
                deduplicated: 0,
                suppressed: 0,
                grouped: 0,
                notificationsSent: 0,
            }
        );

        // Recompute from the summed counts instead of averaging daily
        // percentages, so busy days are weighted correctly.
        return {
            period: { startDate, endDate },
            ...totals,
            noiseReductionPercent: this.calculateNoiseReductionPercent(
                totals.deduplicated + totals.suppressed,
                totals.totalAlertTriggers
            ),
        };
    }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Worker Jobs
|
||||
|
||||
#### Storm Monitor Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertStorm/Monitor.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
import { EVERY_FIVE_MINUTES } from 'Common/Utils/CronTime';
import StormDetector from 'Common/Server/Utils/Alert/StormDetector';
import ProjectService from 'Common/Server/Services/ProjectService';
// Used by the error handler below; without this import the job does not compile.
import logger from 'Common/Server/Utils/Logger';

/**
 * Every five minutes, run storm detection for each unblocked project.
 * A failure for one project is logged and does not stop the others.
 */
RunCron(
    'AlertStorm:Monitor',
    { schedule: EVERY_FIVE_MINUTES, runOnStartup: false },
    async () => {
        // Get all active projects
        // NOTE(review): only the first 1000 projects are processed per run;
        // paginate if deployments can exceed that.
        const projects = await ProjectService.findBy({
            query: { isBlocked: false },
            select: { _id: true },
            limit: 1000,
            skip: 0,
            props: { isRoot: true },
        });

        for (const project of projects) {
            try {
                await StormDetector.processStormDetection(project.id!);
            } catch (error) {
                logger.error(
                    `Error processing storm detection for project ${project.id}:`,
                    error
                );
            }
        }
    }
);
|
||||
```
|
||||
|
||||
#### Daily Metrics Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/NoiseReduction/DailyMetrics.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
import { EVERY_DAY_AT_MIDNIGHT } from 'Common/Utils/CronTime';
import NoiseReductionAnalytics from 'Common/Server/Utils/Alert/NoiseReductionAnalytics';
import ProjectService from 'Common/Server/Services/ProjectService';
import OneUptimeDate from 'Common/Types/Date';
// Used for error and summary logging below; without this import the job does not compile.
import logger from 'Common/Server/Utils/Logger';

/**
 * Once a day at midnight, calculate the previous day's noise reduction
 * metrics for each unblocked project. A failure for one project is logged
 * and does not stop the others.
 */
RunCron(
    'NoiseReduction:DailyMetrics',
    { schedule: EVERY_DAY_AT_MIDNIGHT, runOnStartup: false },
    async () => {
        // Calculate metrics for yesterday
        const yesterday = OneUptimeDate.addRemoveDays(
            OneUptimeDate.getCurrentDate(),
            -1
        );

        // NOTE(review): only the first 1000 projects are processed per run;
        // paginate if deployments can exceed that.
        const projects = await ProjectService.findBy({
            query: { isBlocked: false },
            select: { _id: true },
            limit: 1000,
            skip: 0,
            props: { isRoot: true },
        });

        for (const project of projects) {
            try {
                await NoiseReductionAnalytics.calculateDailyMetrics(
                    project.id!,
                    yesterday
                );
            } catch (error) {
                logger.error(
                    `Error calculating metrics for project ${project.id}:`,
                    error
                );
            }
        }

        logger.info(`Calculated daily noise reduction metrics for ${projects.length} projects`);
    }
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Phase 1: Storm Detection
|
||||
- [ ] Create StormDetector utility
|
||||
- [ ] Create AlertStormEventService
|
||||
- [ ] Implement storm detection algorithm
|
||||
- [ ] Create storm monitor worker job
|
||||
|
||||
### Phase 2: Notifications
|
||||
- [ ] Storm start notification
|
||||
- [ ] Storm end notification
|
||||
- [ ] Admin notification integration
|
||||
|
||||
### Phase 3: Analytics
|
||||
- [ ] Create NoiseReductionAnalytics utility
|
||||
- [ ] Create NoiseReductionMetricService
|
||||
- [ ] Implement daily metrics calculation
|
||||
- [ ] Create daily metrics worker job
|
||||
|
||||
### Phase 4: Testing
|
||||
- [ ] Unit tests for StormDetector
|
||||
- [ ] Unit tests for NoiseReductionAnalytics
|
||||
- [ ] Integration tests for worker jobs
|
||||
222
Docs/Plan/AlertStormDetection/3-API.md
Normal file
222
Docs/Plan/AlertStormDetection/3-API.md
Normal file
@@ -0,0 +1,222 @@
|
||||
# API Design for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the REST API endpoints for Alert Storm Detection and Noise Reduction Analytics.
|
||||
|
||||
## Storm Events API
|
||||
|
||||
### Get Current Storm Status
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm/status
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"isStorm": true,
|
||||
"severity": "critical",
|
||||
"currentRate": 150,
|
||||
"normalRate": 30,
|
||||
"multiplier": 5.0,
|
||||
"affectedMonitors": [
|
||||
{ "monitorId": "mon-1", "monitorName": "mysql-prod", "alertCount": 45 },
|
||||
{ "monitorId": "mon-2", "monitorName": "api-gateway", "alertCount": 32 }
|
||||
],
|
||||
"activeStormEvent": {
|
||||
"_id": "storm-event-1",
|
||||
"startedAt": "2026-01-20T10:00:00Z",
|
||||
"peakAlertRate": 180,
|
||||
"durationMinutes": 45
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### List Storm Events
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm-event
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `status` | string | Filter by status (active, resolved) |
|
||||
| `startedAt` | DateRange | Filter by start date |
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "storm-1",
|
||||
"status": "resolved",
|
||||
"severity": "critical",
|
||||
"startedAt": "2026-01-19T14:00:00Z",
|
||||
"endedAt": "2026-01-19T15:30:00Z",
|
||||
"durationMinutes": 90,
|
||||
"peakAlertRate": 250,
|
||||
"normalAlertRate": 30,
|
||||
"multiplier": 8.33,
|
||||
"totalAlertsInStorm": 450,
|
||||
"affectedMonitors": [...]
|
||||
}
|
||||
],
|
||||
"count": 15
|
||||
}
|
||||
```
|
||||
|
||||
### Get Storm Event Details
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm-event/{eventId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Noise Reduction Analytics API
|
||||
|
||||
### Get Noise Reduction Summary
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/noise-reduction/summary
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"period": {
|
||||
"startDate": "2026-01-13T00:00:00Z",
|
||||
"endDate": "2026-01-20T00:00:00Z"
|
||||
},
|
||||
"totalAlertTriggers": 10000,
|
||||
"alertsCreated": 3500,
|
||||
"deduplicated": 4000,
|
||||
"suppressed": 2500,
|
||||
"grouped": 1500,
|
||||
"notificationsSent": 2000,
|
||||
"noiseReductionPercent": 65.0,
|
||||
"breakdown": {
|
||||
"byDeduplication": 40.0,
|
||||
"bySuppression": 25.0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Get Daily Metrics
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/noise-reduction/daily
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"date": "2026-01-20",
|
||||
"totalAlertTriggers": 1500,
|
||||
"alertsCreated": 500,
|
||||
"deduplicated": 600,
|
||||
"suppressed": 400,
|
||||
"alertsGrouped": 200,
|
||||
"episodesCreated": 15,
|
||||
"noiseReductionPercent": 66.67
|
||||
},
|
||||
{
|
||||
"date": "2026-01-19",
|
||||
"totalAlertTriggers": 1200,
|
||||
"alertsCreated": 450
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Get Top Noise Sources
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/noise-reduction/top-sources
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"byMonitor": [
|
||||
{ "monitorId": "mon-1", "monitorName": "mysql-prod", "alertCount": 500, "duplicateCount": 300 },
|
||||
{ "monitorId": "mon-2", "monitorName": "api-gateway", "alertCount": 350, "duplicateCount": 150 }
|
||||
],
|
||||
"bySeverity": [
|
||||
{ "severityId": "sev-1", "severityName": "Warning", "alertCount": 600 },
|
||||
{ "severityId": "sev-2", "severityName": "Critical", "alertCount": 200 }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Storm Configuration API
|
||||
|
||||
### Get Storm Config
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm/config
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"stormThreshold": 3,
|
||||
"criticalThreshold": 5,
|
||||
"minimumAlertRate": 10,
|
||||
"baselineHours": 24,
|
||||
"enableEmergencySuppression": false,
|
||||
"notifyOnStormStart": true,
|
||||
"notifyOnStormEnd": true
|
||||
}
|
||||
```
|
||||
|
||||
### Update Storm Config
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-storm/config
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Storm API
|
||||
- [ ] GET /alert-storm/status
|
||||
- [ ] GET /alert-storm-event (list)
|
||||
- [ ] GET /alert-storm-event/:id
|
||||
- [ ] GET /alert-storm/config
|
||||
- [ ] PUT /alert-storm/config
|
||||
|
||||
### Analytics API
|
||||
- [ ] GET /noise-reduction/summary
|
||||
- [ ] GET /noise-reduction/daily
|
||||
- [ ] GET /noise-reduction/top-sources
|
||||
519
Docs/Plan/AlertStormDetection/4-UI.md
Normal file
519
Docs/Plan/AlertStormDetection/4-UI.md
Normal file
@@ -0,0 +1,519 @@
|
||||
# UI Implementation for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the frontend components and pages required for Alert Storm Detection and Noise Reduction Analytics functionality.
|
||||
|
||||
## Navigation Structure
|
||||
|
||||
```
|
||||
Dashboard
|
||||
├── Alerts
|
||||
│ ├── Alerts (existing)
|
||||
│ ├── Episodes (from Grouping plan)
|
||||
│ └── Storm History (NEW)
|
||||
└── Settings
|
||||
└── Alerts
|
||||
├── Alert States (existing)
|
||||
├── Alert Severities (existing)
|
||||
├── Grouping Rules
|
||||
├── Suppression Rules
|
||||
├── Deduplication
|
||||
└── Storm Detection (NEW)
|
||||
|
||||
Analytics (NEW section or add to existing)
|
||||
└── Noise Reduction Dashboard (NEW)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages to Create
|
||||
|
||||
### 1. Storm Detection Settings Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertStormDetection.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-storm-detection`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Alert Storm Detection │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Alert Storm Detection identifies when alert volume spikes abnormally above │ │
|
||||
│ │ historical baselines. This helps identify major incidents and prevent alert │ │
|
||||
│ │ fatigue during outages. │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ DETECTION THRESHOLDS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Storm Threshold (multiplier) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 3 │ x normal rate │
|
||||
│ └──────────┘ │
|
||||
│ Alert volume must reach or exceed this multiplier to be considered a storm. │
|
||||
│ │
|
||||
│ Critical Storm Threshold (multiplier) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 5 │ x normal rate │
|
||||
│ └──────────┘ │
|
||||
│ Storms exceeding this multiplier are marked as critical. │
|
||||
│ │
|
||||
│ Minimum Alert Rate │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 10 │ alerts per hour │
|
||||
│ └──────────┘ │
|
||||
│ Minimum baseline rate required before storm detection activates. │
|
||||
│ │
|
||||
│ Baseline Period │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 24 │ hours │
|
||||
│ └──────────┘ │
|
||||
│ Historical period used to calculate normal alert rate. │
|
||||
│ │
|
||||
│ NOTIFICATIONS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ☑ Notify when storm starts │
|
||||
│ ☑ Notify when storm ends │
|
||||
│ ☐ Enable emergency suppression during storms │
|
||||
│ ⚠️ This will automatically suppress non-critical alerts during storms │
|
||||
│ │
|
||||
│ [Save Changes] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Storm History Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/StormHistory.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/storm-history`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts > Storm History │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 🔴 ACTIVE STORM Started: 45 minutes ago │ │
|
||||
│ │ │ │
|
||||
│ │ Current Rate: 150 alerts/hour (5x normal) │ │
|
||||
│ │ Peak Rate: 180 alerts/hour │ │
|
||||
│ │ Affected Monitors: 12 │ │
|
||||
│ │ │ │
|
||||
│ │ [View Details] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ STORM EVENTS [Filters ▼] │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌──────────┬──────────┬───────────┬──────────┬──────────┬──────────┬─────────────┐ │
|
||||
│ │ Status │ Severity │ Started │ Duration │ Peak │ Alerts │ Monitors │ │
|
||||
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
|
||||
│ │ 🔴 Active│ Critical │ Today │ 45m │ 180/hr │ 450 │ 12 │ │
|
||||
│ │ │ │ 10:15 AM │ │ (6x) │ │ │ │
|
||||
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
|
||||
│ │ ✅ Resv'd│ Storm │ Yesterday │ 1h 30m │ 120/hr │ 280 │ 8 │ │
|
||||
│ │ │ │ 2:30 PM │ │ (4x) │ │ │ │
|
||||
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
|
||||
│ │ ✅ Resv'd│ Critical │ Jan 18 │ 2h 15m │ 250/hr │ 620 │ 15 │ │
|
||||
│ │ │ │ 8:00 AM │ │ (8.3x) │ │ │ │
|
||||
│ └──────────┴──────────┴───────────┴──────────┴──────────┴──────────┴─────────────┘ │
|
||||
│ │
|
||||
│ ◄ Previous Page 1 of 3 Next ► │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Storm Event Detail Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/StormEventDetail.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/storm-history/:stormEventId`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Storm History > Storm Event #storm-123 │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌───────────────────────────────────────┬─────────────────────────────────────────┐ │
|
||||
│ │ STORM SUMMARY │ ALERT VOLUME TIMELINE │ │
|
||||
│ │ │ │ │
|
||||
│ │ Status: 🔴 Active │ 250 ─┬───────────────────────────── │ │
|
||||
│ │ Severity: Critical │ │ ╭─────╮ │ │
|
||||
│ │ Started: Jan 20, 2026 10:15 AM │ 200 ─┤ ╭╯ ╰╮ │ │
|
||||
│ │ Duration: 45 minutes (ongoing) │ │ ╭╯ ╰╮ │ │
|
||||
│ │ │ 150 ─┤ ╭╯ Peak ╰─current │ │
|
||||
│ │ Peak Alert Rate: 180/hour │ │ ╭╯ 180/hr │ │
|
||||
│ │ Normal Rate: 30/hour │ 100 ─┤ ╭╯ │ │
|
||||
│ │ Multiplier: 6x │ │ ╭╯ │ │
|
||||
│ │ │ 50 ─┤──╯ │ │
|
||||
│ │ Total Alerts: 450 │ │ baseline: 30/hr ─ ─ ─ ─ ─ ─ │ │
|
||||
│ │ Affected Monitors: 12 │ 0 ─┴───────────────────────────── │ │
|
||||
│ │ │ 10:00 10:15 10:30 10:45 │ │
|
||||
│ └───────────────────────────────────────┴─────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ TOP ALERTING MONITORS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────┬──────────┬──────────────────────────────────┐ │
|
||||
│ │ Monitor │ Alerts │ Distribution │ │
|
||||
│ ├──────────────────────────────────┼──────────┼──────────────────────────────────┤ │
|
||||
│ │ 🖥️ mysql-prod-01 │ 85 │ ██████████████████░░ 19% │ │
|
||||
│ │ 🖥️ api-gateway-main │ 72 │ ███████████████░░░░░ 16% │ │
|
||||
│ │ 🖥️ redis-cluster-a │ 58 │ ████████████░░░░░░░░ 13% │ │
|
||||
│ │ 🖥️ postgres-replica-02 │ 45 │ ██████████░░░░░░░░░░ 10% │ │
|
||||
│ │ 🖥️ load-balancer-east │ 38 │ ████████░░░░░░░░░░░░ 8% │ │
|
||||
│ └──────────────────────────────────┴──────────┴──────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ALERTS IN THIS STORM [View All] │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌───────┬──────────────────────────────────────┬──────────┬──────────────────────┐ │
|
||||
│ │ ID │ Title │ Monitor │ Time │ │
|
||||
│ ├───────┼──────────────────────────────────────┼──────────┼──────────────────────┤ │
|
||||
│ │ #1234 │ Connection timeout │ mysql-01 │ 10:45 AM │ │
|
||||
│ │ #1233 │ Response time exceeded threshold │ api-gw │ 10:44 AM │ │
|
||||
│ │ #1232 │ Memory usage critical │ redis-a │ 10:43 AM │ │
|
||||
│ └───────┴──────────────────────────────────────┴──────────┴──────────────────────┘ │
|
||||
│ │
|
||||
│ Showing 3 of 450 alerts │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Noise Reduction Analytics Dashboard
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Analytics/NoiseReductionDashboard.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/analytics/noise-reduction`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Analytics > Noise Reduction [Last 7 Days ▼] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ NOISE REDUCTION OVERVIEW │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
|
||||
│ │ Alert Triggers │ │ Alerts Created │ │ Noise Reduced │ │ Notifications │ │
|
||||
│ │ │ │ │ │ │ │ │ │
|
||||
│ │ 10,000 │ │ 3,500 │ │ 65% │ │ 2,000 │ │
|
||||
│ │ total │ │ created │ │ reduction │ │ sent │ │
|
||||
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
|
||||
│ │
|
||||
│ REDUCTION BREAKDOWN │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ │ │
|
||||
│ │ Total Alert Triggers: 10,000 │ │
|
||||
│ │ ├─────────────────────────────────────────────────────────────────────────────│ │
|
||||
│ │ │ Deduplicated │ Suppressed │ Grouped │ Created │ │
|
||||
│ │ │ 4,000 (40%) │ 2,500 (25%) │ 1,500 (15%)│ 3,500 (35%) │ │
|
||||
│ │ │ ████████████████████ │ ████████████ │ ███████ │ ██████████████ │ │
|
||||
│ │ └─────────────────────────────────────────────────────────────────────────────│ │
|
||||
│ │ │ │
|
||||
│ │ Legend: │ │
|
||||
│ │ ■ Deduplicated - Merged with existing alerts │ │
|
||||
│ │ ■ Suppressed - Blocked by suppression rules │ │
|
||||
│ │ ■ Grouped - Added to existing episodes (subset of Created; reduces notifications) │ │
|
||||
│ │ ■ Created - New unique alerts │ │
|
||||
│ │ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ DAILY TREND │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 2000 ─┬───────────────────────────────────────────────────────────────── │ │
|
||||
│ │ │ │ │
|
||||
│ │ 1500 ─┤ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ │ │
|
||||
│ │ │ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ │ │
|
||||
│ │ 1000 ─┤ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓ │ │
|
||||
│ │ │ ▓▓▓▓░░ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
|
||||
│ │ 500 ─┤ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
|
||||
│ │ │ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
|
||||
│ │ 0 ─┴────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ Jan 14 Jan 15 Jan 16 Jan 17 Jan 18 Jan 19 Jan 20 │ │
|
||||
│ │ │ │
|
||||
│ │ ▓ Triggers ░ Created Line: Reduction % │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ TOP NOISE SOURCES │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ By Monitor By Severity │
|
||||
│ ┌────────────────────────────────────┐ ┌────────────────────────────────────────┐ │
|
||||
│ │ 1. mysql-prod 500 alerts │ │ 1. Warning 600 alerts │ │
|
||||
│ │ ████████████████████ (300 dupe)│ │ ████████████████████████ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ 2. api-gateway 350 alerts │ │ 2. High 400 alerts │ │
|
||||
│ │ ██████████████ (150 dupe) │ │ ████████████████ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ 3. redis-cluster 280 alerts │ │ 3. Critical 200 alerts │ │
|
||||
│ │ ███████████ (180 dupe) │ │ ████████ │ │
|
||||
│ └────────────────────────────────────┘ └────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Components to Create
|
||||
|
||||
### 1. StormStatusBanner
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/StormStatusBanner.tsx`
|
||||
|
||||
Global banner that appears when a storm is active.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the global banner that appears while an alert storm is active.
 */
interface StormStatusBannerProps {
  /**
   * Storm event to render in the banner.
   * NOTE(review): presumably `null` means no storm is active and the banner
   * renders nothing — confirm with the component implementation.
   */
  stormEvent: AlertStormEvent | null;
  /** Optional handler for the banner's [Dismiss] action. */
  onDismiss?: () => void;
  /** Optional handler for the banner's [View Details] action. */
  onViewDetails?: () => void;
}
|
||||
```
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ⚠️ ALERT STORM DETECTED - 150 alerts/hour (5x normal) - 12 monitors affected │
|
||||
│ [View Details] [Dismiss] │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 2. StormSeverityBadge
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/StormSeverityBadge.tsx`
|
||||
|
||||
Badge showing storm severity level.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the badge that shows a storm's severity level.
 */
interface StormSeverityBadgeProps {
  /** Severity level to display, from least (`normal`) to most (`critical`) severe. */
  severity: 'normal' | 'elevated' | 'storm' | 'critical';
  /**
   * Optional label toggle.
   * NOTE(review): the default when omitted is not specified in this plan.
   */
  showLabel?: boolean;
}
|
||||
```
|
||||
|
||||
### 3. AlertVolumeChart
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/AlertVolumeChart.tsx`
|
||||
|
||||
Line chart showing alert volume over time.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the line chart that shows alert volume over time.
 */
interface AlertVolumeChartProps {
  /** Data points to plot, one per sampled moment. */
  data: Array<{
    /** Moment the sample refers to. */
    timestamp: Date;
    /** Number of alerts for this sample. */
    alertCount: number;
    /** Optional baseline (normal) value for this sample. */
    baseline?: number;
  }>;
  /** Optional toggle for drawing the baseline alongside the alert volume. */
  showBaseline?: boolean;
  /** Optional toggle for visually highlighting storm periods. */
  highlightStormPeriods?: boolean;
}
|
||||
```
|
||||
|
||||
### 4. NoiseReductionSummaryCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/NoiseReductionSummaryCard.tsx`
|
||||
|
||||
Card showing noise reduction summary statistics.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the card that shows noise reduction summary statistics.
 */
interface NoiseReductionSummaryCardProps {
  /** Total number of alert triggers in the selected period. */
  totalTriggers: number;
  /** Number of alerts actually created from those triggers. */
  alertsCreated: number;
  /** Number of triggers merged into existing alerts. */
  deduplicated: number;
  /** Number of triggers blocked by suppression rules. */
  suppressed: number;
  /** Number of alerts added to existing episodes. */
  grouped: number;
  /** Noise reduction as a percentage (e.g. `65` for 65%). */
  reductionPercent: number;
}
|
||||
```
|
||||
|
||||
### 5. NoiseReductionBreakdownChart
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/NoiseReductionBreakdownChart.tsx`
|
||||
|
||||
Stacked bar or donut chart showing reduction breakdown.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the stacked bar or donut chart that shows the noise reduction breakdown.
 */
interface NoiseReductionBreakdownChartProps {
  /** Triggers merged with existing alerts. */
  deduplicated: number;
  /** Triggers blocked by suppression rules. */
  suppressed: number;
  /** Alerts added to existing episodes. */
  grouped: number;
  /** New unique alerts created. */
  created: number;
}
|
||||
```
|
||||
|
||||
### 6. TopNoiseSourcesTable
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/TopNoiseSourcesTable.tsx`
|
||||
|
||||
Table showing the top noise-generating sources, grouped by monitor, severity, or service (see `groupBy`).
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the table that lists the top noise-generating sources.
 */
interface TopNoiseSourcesTableProps {
  /** Rows to display, one per source. */
  sources: Array<{
    /** Identifier of the source (e.g. a monitor or severity ID). */
    id: string;
    /** Display name of the source. */
    name: string;
    /** Number of alerts attributed to this source. */
    alertCount: number;
    /**
     * Number of duplicates attributed to this source. Optional because the
     * top-sources API only reports it for monitor rows; severity rows carry
     * an alert count only.
     */
    duplicateCount?: number;
  }>;
  /** Dimension the rows are grouped by. */
  groupBy: 'monitor' | 'severity' | 'service';
}
|
||||
```
|
||||
|
||||
### 7. DailyMetricsChart
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/DailyMetricsChart.tsx`
|
||||
|
||||
Bar chart showing daily noise reduction metrics.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the bar chart that shows daily noise reduction metrics.
 */
interface DailyMetricsChartProps {
  /** One entry per day to plot. */
  data: Array<{
    /**
     * Day the entry refers to.
     * NOTE(review): presumably the `YYYY-MM-DD` string returned by the daily
     * metrics API — confirm.
     */
    date: string;
    /** Total alert triggers on that day. */
    totalTriggers: number;
    /** Alerts created on that day. */
    alertsCreated: number;
    /** Noise reduction for that day, as a percentage. */
    reductionPercent: number;
  }>;
}
|
||||
```
|
||||
|
||||
### 8. AffectedMonitorsTable
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/AffectedMonitorsTable.tsx`
|
||||
|
||||
Table showing monitors contributing to a storm.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the table that lists the monitors contributing to a storm.
 */
interface AffectedMonitorsTableProps {
  /** Rows to display, one per contributing monitor. */
  monitors: Array<{
    /** Identifier of the monitor. */
    monitorId: string;
    /** Display name of the monitor. */
    monitorName: string;
    /** Number of alerts this monitor raised during the storm. */
    alertCount: number;
    /**
     * This monitor's share of the alerts.
     * NOTE(review): presumably a 0-100 percentage of the storm's total alerts
     * (the wireframe shows 85 of 450 as 19%) — confirm.
     */
    percentage: number;
  }>;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Routing Configuration
|
||||
|
||||
Add to route configuration:
|
||||
|
||||
```typescript
|
||||
// NOTE: fragment, not a standalone module — these entries are meant to be
// added to the dashboard's existing route configuration.
// Storm Detection Settings — page for editing storm thresholds and notifications
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-storm-detection',
|
||||
component: AlertStormDetectionSettingsPage,
|
||||
}
|
||||
|
||||
// Storm History — list of active and past storm events
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/storm-history',
|
||||
component: StormHistoryPage,
|
||||
}
|
||||
|
||||
// Storm Event Detail — single storm event, selected by :stormEventId
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/storm-history/:stormEventId',
|
||||
component: StormEventDetailPage,
|
||||
}
|
||||
|
||||
// Noise Reduction Analytics — noise reduction dashboard
|
||||
{
|
||||
path: '/dashboard/:projectId/analytics/noise-reduction',
|
||||
component: NoiseReductionDashboardPage,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Global Integration
|
||||
|
||||
### Dashboard Header Integration
|
||||
|
||||
Add storm status indicator to main dashboard header:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ OneUptime [Projects ▼] Alerts Monitors Status Pages 🔴 Storm Active │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Alerts Page Integration
|
||||
|
||||
Add storm status banner above alerts list when storm is active:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ⚠️ Alert Storm Active - Click to view details │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [Normal alerts table continues below...] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pages
|
||||
- [ ] Storm Detection settings page
|
||||
- [ ] Storm History list page
|
||||
- [ ] Storm Event detail page
|
||||
- [ ] Noise Reduction Analytics dashboard
|
||||
|
||||
### Components
|
||||
- [ ] StormStatusBanner
|
||||
- [ ] StormSeverityBadge
|
||||
- [ ] AlertVolumeChart
|
||||
- [ ] NoiseReductionSummaryCard
|
||||
- [ ] NoiseReductionBreakdownChart
|
||||
- [ ] TopNoiseSourcesTable
|
||||
- [ ] DailyMetricsChart
|
||||
- [ ] AffectedMonitorsTable
|
||||
|
||||
### Global Integrations
|
||||
- [ ] Add storm indicator to dashboard header
|
||||
- [ ] Add storm banner to Alerts page
|
||||
- [ ] Add sidebar navigation items
|
||||
|
||||
### Styling
|
||||
- [ ] Storm severity color scheme (yellow/orange/red)
|
||||
- [ ] Chart styling for analytics
|
||||
- [ ] Banner animation styles
|
||||
- [ ] Responsive layouts
|
||||
|
||||
### Data Fetching
|
||||
- [ ] Storm status polling (every 30 seconds when on dashboard)
|
||||
- [ ] Storm events API integration
|
||||
- [ ] Noise reduction metrics API integration
|
||||
- [ ] WebSocket support for real-time storm updates (optional)
|
||||
159
Docs/Plan/AlertStormDetection/README.md
Normal file
159
Docs/Plan/AlertStormDetection/README.md
Normal file
@@ -0,0 +1,159 @@
|
||||
# Alert Storm Detection Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This sub-plan details the implementation of Alert Storm Detection and Analytics functionality for OneUptime. This feature detects when alert volume spikes abnormally and provides noise reduction analytics.
|
||||
|
||||
## Documents
|
||||
|
||||
| Document | Description |
|
||||
|----------|-------------|
|
||||
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
|
||||
| [2-Backend.md](./2-Backend.md) | Backend services and storm detector |
|
||||
| [3-API.md](./3-API.md) | REST API endpoints |
|
||||
| [4-UI.md](./4-UI.md) | Frontend components and pages |
|
||||
|
||||
## Feature Summary
|
||||
|
||||
### What is Alert Storm Detection?
|
||||
|
||||
Alert Storm Detection identifies when the rate of incoming alerts significantly exceeds normal patterns. This helps operators understand when something unusual is happening and optionally enables automatic suppression during storms.
|
||||
|
||||
### Storm Detection Logic
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Storm Detection Algorithm │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
Current Hour Historical Average Storm Check
|
||||
┌─────────────────┐ ┌─────────────────────┐ ┌─────────────┐
|
||||
│ │ │ │ │ │
|
||||
│ 150 alerts │ vs │ 30 alerts/hour │ = │ 5x normal │
|
||||
│ (this hour) │ │ (last 24h avg) │ │ = STORM! │
|
||||
│ │ │ │ │ │
|
||||
└─────────────────┘ └─────────────────────┘ └─────────────┘
|
||||
```
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
1. **Storm Detection** - Identify abnormal alert spikes
|
||||
2. **Historical Analysis** - Compare against baseline patterns
|
||||
3. **Storm Alerts** - Notify admins when storm detected
|
||||
4. **Emergency Suppression** - Optional auto-suppression during storms
|
||||
5. **Noise Reduction Analytics** - Track overall noise reduction metrics
|
||||
6. **Top Alerting Sources** - Identify which monitors/services cause most noise
|
||||
|
||||
### Storm Thresholds
|
||||
|
||||
| Level | Multiplier | Description |
|
||||
|-------|------------|-------------|
|
||||
| Normal | < 2x | Within normal variance |
|
||||
| Elevated | 2x to < 3x | Higher than usual (below the storm threshold) |
|
||||
| Storm | 3x - 5x | Significant spike |
|
||||
| Critical Storm | > 5x | Major incident likely |
|
||||
|
||||
### User Stories
|
||||
|
||||
```
|
||||
As an SRE, I want to be notified when an alert storm starts
|
||||
so that I know something significant is happening.
|
||||
|
||||
As an operator, I want to see which monitors are causing the most alerts
|
||||
so that I can prioritize investigation.
|
||||
|
||||
As a team lead, I want to see noise reduction metrics
|
||||
so that I can measure the effectiveness of our alert tuning.
|
||||
|
||||
As an admin, I want to enable emergency suppression during storms
|
||||
so that my team isn't overwhelmed during major incidents.
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Storm Detection Core (Week 1)
|
||||
|
||||
- [ ] Create AlertStormEvent model
|
||||
- [ ] Implement StormDetector service
|
||||
- [ ] Create storm monitoring worker job
|
||||
- [ ] Add storm detection settings
|
||||
|
||||
### Phase 2: Storm Notifications (Week 2)
|
||||
|
||||
- [ ] Storm start/end notifications
|
||||
- [ ] Top alerting monitors identification
|
||||
- [ ] Storm event timeline
|
||||
- [ ] Admin notifications
|
||||
|
||||
### Phase 3: Noise Reduction Analytics (Week 3)
|
||||
|
||||
- [ ] Create NoiseReductionMetric model
|
||||
- [ ] Daily metrics calculation job
|
||||
- [ ] Deduplication statistics
|
||||
- [ ] Suppression statistics
|
||||
- [ ] Grouping statistics
|
||||
|
||||
### Phase 4: UI Dashboard (Week 4)
|
||||
|
||||
- [ ] Storm status banner
|
||||
- [ ] Noise reduction dashboard
|
||||
- [ ] Alert volume charts
|
||||
- [ ] Top alerting sources view
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Storm Detection Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌───────────────────┐
|
||||
│ Worker Job │
|
||||
│ (Every 5 min) │
|
||||
└─────────┬─────────┘
|
||||
│
|
||||
▼
|
||||
┌───────────────────────────────────────────────────────────────────────────┐
|
||||
│ StormDetector.checkStatus() │
|
||||
├───────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────────┐ │
|
||||
│ │ Get current │──▶│ Get historical │──▶│ Calculate multiplier │ │
|
||||
│ │ hour count │ │ average │ │ current / historical │ │
|
||||
│ └─────────────────┘ └─────────────────┘ └─────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌─────────┴─────────┐ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐│
|
||||
│ │ multiplier < 3 │ │ multiplier >= 3 ││
|
||||
│ │ = Normal │ │ = STORM ││
|
||||
│ └─────────────────┘ └────────┬────────┘│
|
||||
│ │ │
|
||||
└──────────────────────────────────────────────────────────────────┼─────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────┐
|
||||
│ Storm Actions: │
|
||||
│ - Create AlertStormEvent│
|
||||
│ - Notify admins │
|
||||
│ - Show banner │
|
||||
│ - Optional: auto-suppress│
|
||||
└─────────────────────────┘
|
||||
```
|
||||
|
||||
## Success Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Storm detection accuracy | > 95% |
|
||||
| Detection latency | < 5 minutes |
|
||||
| False positive rate | < 5% |
|
||||
| Noise reduction visibility | 100% of projects |
|
||||
|
||||
## References
|
||||
|
||||
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
|
||||
- [Alert Grouping Plan](../AlertGrouping/README.md)
|
||||
- [Alert Suppression Plan](../AlertSuppression/README.md)
|
||||
- [Alert Deduplication Plan](../AlertDeduplication/README.md)
|
||||
1117
Docs/Plan/AlertSuppression/1-DataModels.md
Normal file
1117
Docs/Plan/AlertSuppression/1-DataModels.md
Normal file
File diff suppressed because it is too large
Load Diff
909
Docs/Plan/AlertSuppression/2-Backend.md
Normal file
909
Docs/Plan/AlertSuppression/2-Backend.md
Normal file
@@ -0,0 +1,909 @@
|
||||
# Backend Implementation for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the backend services and components required for Alert Suppression functionality.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Suppression Evaluation Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌──────────────────────┐
|
||||
│ Alert Trigger │
|
||||
│ (Monitor/Manual) │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ SuppressionEngine.evaluate() │
|
||||
├──────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────┐ │
|
||||
│ │ 1. Get enabled │ │
|
||||
│ │ rules │ │
|
||||
│ └────────┬────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 2. For each rule (sorted by priority): │ │
|
||||
│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │
|
||||
│ │ │ Match Criteria│─▶│ Check if Rule │─▶│ Apply Action │ │ │
|
||||
│ │ │ Evaluation │ │ is Active │ │ │ │ │
|
||||
│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌───────────────┼───────────────┐ │ │
|
||||
│ │ ▼ ▼ ▼ │ │
|
||||
│ │ ┌─────────────────┐ ┌──────────────┐ ┌─────────────────┐ │ │
|
||||
│ │ │ Maintenance │ │ Condition │ │ Rate Limit │ │ │
|
||||
│ │ │ Window Active? │ │ Met? │ │ Exceeded? │ │ │
|
||||
│ │ └─────────────────┘ └──────────────┘ └─────────────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────┐ │
|
||||
│ │ 3. Determine │ │
|
||||
│ │ final action │ │
|
||||
│ └─────────────────┘ │
|
||||
│ │
|
||||
└──────────────────────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────┴──────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌────────┐ ┌────────┐
|
||||
│SUPPRESS│ │ ALLOW │
|
||||
└────────┘ └────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Services to Create
|
||||
|
||||
### 1. AlertSuppressionRuleService
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertSuppressionRuleService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import AlertSuppressionRule, {
|
||||
SuppressionRuleType,
|
||||
} from '../Models/DatabaseModels/AlertSuppressionRule';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import SortOrder from 'Common/Types/BaseDatabase/SortOrder';
|
||||
|
||||
/**
 * Database service for alert suppression rules: loads a project's enabled
 * rules, works out which maintenance windows are currently active, and
 * records suppression hits on a rule.
 */
export class Service extends DatabaseService<AlertSuppressionRule> {
  public constructor() {
    super(AlertSuppressionRule);
  }

  /**
   * Get all enabled rules for a project, sorted by ascending priority.
   *
   * @param projectId - Project whose rules are loaded.
   * @returns The enabled rules, with the fields needed for evaluation selected.
   */
  public async getEnabledRulesForProject(
    projectId: ObjectID
  ): Promise<Array<AlertSuppressionRule>> {
    return await this.findBy({
      query: {
        projectId,
        isEnabled: true,
      },
      select: {
        _id: true,
        name: true,
        type: true,
        matchCriteria: true,
        maintenanceWindow: true,
        condition: true,
        rateLimit: true,
        action: true,
        suppressionGroupId: true,
        priority: true,
      },
      sort: { priority: SortOrder.Ascending },
      props: { isRoot: true },
    });
  }

  /**
   * Get the enabled maintenance-window rules that are active right now.
   *
   * @param projectId - Project whose rules are inspected.
   * @returns Rules of type `MaintenanceWindow` whose window contains the current time.
   */
  public async getActiveMaintenanceWindows(
    projectId: ObjectID
  ): Promise<Array<AlertSuppressionRule>> {
    const rules = await this.getEnabledRulesForProject(projectId);

    return rules.filter((rule) => {
      if (rule.type !== SuppressionRuleType.MaintenanceWindow) {
        return false;
      }
      return this.isMaintenanceWindowActive(rule);
    });
  }

  /**
   * Check if a rule's maintenance window is currently active.
   *
   * Recurring windows (with a recurrence rule) are delegated to
   * `evaluateRecurrence`; one-off windows are active while the current time
   * lies within `[startTime, endTime]` (inclusive).
   */
  private isMaintenanceWindowActive(rule: AlertSuppressionRule): boolean {
    const window = rule.maintenanceWindow;
    if (!window) {
      return false;
    }

    const now = new Date();

    if (window.isRecurring && window.recurrenceRule) {
      return this.evaluateRecurrence(window, now);
    }

    /*
     * Compare epoch milliseconds rather than the raw values.
     * NOTE(review): if the window is stored as JSON, startTime/endTime may
     * arrive as ISO strings rather than Date objects — confirm. Coercing
     * through `new Date(...)` handles both; an invalid date yields NaN and
     * the comparison is false.
     */
    const startMs = new Date(window.startTime).getTime();
    const endMs = new Date(window.endTime).getTime();
    const nowMs = now.getTime();

    return nowMs >= startMs && nowMs <= endMs;
  }

  /**
   * Evaluate a recurring maintenance window (RRULE format).
   *
   * The window's `startTime`/`endTime` define the duration of every
   * occurrence. The window is active when the most recent occurrence that
   * started at or before `now` has not yet ended.
   *
   * @param window - Maintenance window configuration with a recurrence rule.
   * @param now - Moment to evaluate.
   * @returns `true` when `now` falls inside an occurrence; `false` otherwise,
   * including when the rule is missing or cannot be parsed.
   */
  private evaluateRecurrence(
    window: MaintenanceWindowConfig,
    now: Date
  ): boolean {
    if (!window.recurrenceRule) {
      return false;
    }

    try {
      const RRule = require('rrule').RRule;

      const windowStart = new Date(window.startTime);
      const durationMs =
        new Date(window.endTime).getTime() - windowStart.getTime();

      // NaN (invalid dates) or a negative duration can never contain `now`.
      if (!(durationMs >= 0)) {
        return false;
      }

      /*
       * Anchor the recurrence to the window's own start unless the rule
       * carries an explicit DTSTART; otherwise rrule defaults the start to
       * the moment of parsing and occurrences drift with the clock.
       */
      const parsed = RRule.parseString(window.recurrenceRule);
      const rule = new RRule({
        ...parsed,
        dtstart: parsed.dtstart ?? windowStart,
      });

      /*
       * Take the latest occurrence starting at or before `now`. Searching
       * forward from a fixed look-back would return the earliest occurrence
       * in that span, which is wrong for rules that recur more than once per
       * look-back period and for windows longer than the look-back.
       */
      const latestStart = rule.before(now, true);

      if (!latestStart) {
        return false;
      }

      return now.getTime() <= latestStart.getTime() + durationMs;
    } catch (error) {
      logger.error('Error evaluating recurrence rule:', error);
      return false;
    }
  }

  /**
   * Increment the suppressed count for a rule and stamp it as triggered now.
   *
   * @param ruleId - Rule that suppressed an alert.
   */
  public async incrementSuppressedCount(ruleId: ObjectID): Promise<void> {
    await this.updateOneById({
      id: ruleId,
      data: {
        suppressedCount: QueryHelper.increment(1),
        lastTriggeredAt: new Date(),
      },
      props: { isRoot: true },
    });
  }
}

export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. SuppressionEngine
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/SuppressionEngine.ts`
|
||||
|
||||
```typescript
|
||||
import Alert from '../../Models/DatabaseModels/Alert';
|
||||
import AlertSuppressionRule, {
|
||||
SuppressionRuleType,
|
||||
SuppressionAction,
|
||||
SuppressionMatchCriteria,
|
||||
RateLimitConfig,
|
||||
} from '../../Models/DatabaseModels/AlertSuppressionRule';
|
||||
import AlertSuppressionRuleService from '../../Services/AlertSuppressionRuleService';
|
||||
import AlertThrottleStateService from '../../Services/AlertThrottleStateService';
|
||||
import SuppressedAlertLogService from '../../Services/SuppressedAlertLogService';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import OneUptimeDate from 'Common/Types/Date';
|
||||
|
||||
/**
 * Outcome of evaluating the enabled suppression rules against one alert.
 */
export interface SuppressionResult {
  /** True when at least one matching rule is currently active. */
  shouldSuppress: boolean;
  /** Combined action of the active rules, or 'none' when nothing applies. */
  action: SuppressionAction | 'none';
  /** Matching, active rules in the order they were evaluated. */
  matchedRules: Array<AlertSuppressionRule>;
  /** Human-readable explanation built from the rule that decided `action`. */
  reason?: string;
}

/**
 * Evaluates suppression rules (maintenance windows, condition-based rules
 * and rate limits) for an alert that is about to be created.
 */
export default class SuppressionEngine {
  /**
   * Evaluate all enabled suppression rules of a project against an alert.
   *
   * Rules are processed in the order returned by
   * AlertSuppressionRuleService.getEnabledRulesForProject(). Evaluation stops
   * at the first active rule that suppresses creation, because nothing can be
   * more restrictive than that. When the alert is suppressed, one entry is
   * written to the suppressed-alert log for the rule that decided the final
   * action.
   *
   * @param alertData - Data of the alert being created.
   * @param projectId - Project whose rules are evaluated.
   * @returns Whether to suppress, which action applies, the rules that
   *   matched and the reason text.
   */
  public static async evaluate(
    alertData: Partial<Alert>,
    projectId: ObjectID
  ): Promise<SuppressionResult> {
    const rules =
      await AlertSuppressionRuleService.getEnabledRulesForProject(projectId);

    if (rules.length === 0) {
      return {
        shouldSuppress: false,
        action: 'none',
        matchedRules: [],
      };
    }

    const matchedRules: Array<AlertSuppressionRule> = [];
    let finalAction: SuppressionAction | undefined;
    // Rule that last changed finalAction; the log entry and the reason are
    // attributed to it so that rule, reason and action always agree.
    let decisiveRule: AlertSuppressionRule | undefined;

    for (const rule of rules) {
      if (!(await this.matchesCriteria(alertData, rule.matchCriteria))) {
        continue;
      }

      if (!(await this.isRuleActive(rule, alertData, projectId))) {
        continue;
      }

      matchedRules.push(rule);

      const combinedAction = this.combineActions(finalAction, rule.action);
      if (combinedAction !== finalAction) {
        finalAction = combinedAction;
        decisiveRule = rule;
      }

      // Once creation is suppressed no later rule can change the outcome.
      if (
        combinedAction === SuppressionAction.SuppressCreation ||
        combinedAction === SuppressionAction.Both
      ) {
        break;
      }
    }

    if (decisiveRule === undefined || finalAction === undefined) {
      return {
        shouldSuppress: false,
        action: 'none',
        matchedRules,
        reason: '',
      };
    }

    const reason = this.buildSuppressionReason(decisiveRule);

    await this.logSuppression(
      alertData,
      decisiveRule,
      projectId,
      reason,
      finalAction
    );

    return {
      shouldSuppress: true,
      action: finalAction,
      matchedRules,
      reason,
    };
  }

  /**
   * Merge the action accumulated so far with the action of another active
   * rule; the most restrictive one wins (Both, then SuppressCreation, then
   * SuppressNotifications). A rule without an action counts as
   * SuppressNotifications.
   */
  private static combineActions(
    current: SuppressionAction | undefined,
    next: SuppressionAction | undefined
  ): SuppressionAction {
    if (
      current === SuppressionAction.Both ||
      next === SuppressionAction.Both
    ) {
      return SuppressionAction.Both;
    }

    if (
      current === SuppressionAction.SuppressCreation ||
      next === SuppressionAction.SuppressCreation
    ) {
      return SuppressionAction.SuppressCreation;
    }

    return SuppressionAction.SuppressNotifications;
  }

  /**
   * Check whether an alert satisfies the match criteria of a rule.
   *
   * Missing criteria or `matchAll` match every alert. Otherwise every
   * configured criterion (severity, monitor, labels, title pattern,
   * description pattern) must hold; for labels, one common label is enough.
   */
  private static async matchesCriteria(
    alertData: Partial<Alert>,
    criteria?: SuppressionMatchCriteria
  ): Promise<boolean> {
    if (!criteria || criteria.matchAll) {
      return true;
    }

    if (criteria.severityIds?.length) {
      const alertSeverityId = alertData.alertSeverityId?.toString();
      if (!alertSeverityId || !criteria.severityIds.includes(alertSeverityId)) {
        return false;
      }
    }

    if (criteria.monitorIds?.length) {
      const alertMonitorId = alertData.monitorId?.toString();
      if (!alertMonitorId || !criteria.monitorIds.includes(alertMonitorId)) {
        return false;
      }
    }

    if (criteria.labelIds?.length) {
      const alertLabelIds = (alertData.labels || []).map(
        (label) => label.id?.toString() || label._id?.toString()
      );
      const sharesLabel = criteria.labelIds.some((id) =>
        alertLabelIds.includes(id)
      );
      if (!sharesLabel) {
        return false;
      }
    }

    if (
      criteria.titlePattern &&
      !this.matchesPattern(criteria.titlePattern, alertData.title || '')
    ) {
      return false;
    }

    if (
      criteria.descriptionPattern &&
      !this.matchesPattern(
        criteria.descriptionPattern,
        alertData.description || ''
      )
    ) {
      return false;
    }

    return true;
  }

  /**
   * Case-insensitive regular-expression test.
   *
   * An invalid pattern is logged and treated as "no match": silently ignoring
   * it would widen the rule and suppress alerts its author never meant to
   * hide.
   *
   * NOTE(review): patterns come from rule configuration; consider validating
   * them (syntax and complexity) when the rule is saved.
   */
  private static matchesPattern(pattern: string, text: string): boolean {
    try {
      return new RegExp(pattern, 'i').test(text);
    } catch (error) {
      logger.error('Invalid suppression rule pattern:', error);
      return false;
    }
  }

  /**
   * Check whether a rule is currently in effect, dispatching on its type.
   * Unknown rule types are never active.
   */
  private static async isRuleActive(
    rule: AlertSuppressionRule,
    alertData: Partial<Alert>,
    projectId: ObjectID
  ): Promise<boolean> {
    switch (rule.type) {
      case SuppressionRuleType.MaintenanceWindow:
        return this.isMaintenanceWindowActive(rule);

      case SuppressionRuleType.ConditionBased:
        return await this.isConditionMet(rule, projectId);

      case SuppressionRuleType.RateLimit:
        return await this.isRateLimitExceeded(rule, alertData, projectId);

      default:
        return false;
    }
  }

  /**
   * Check whether the rule's maintenance window is open right now.
   * One-off windows are open from startTime to endTime, both inclusive.
   */
  private static isMaintenanceWindowActive(rule: AlertSuppressionRule): boolean {
    const window = rule.maintenanceWindow;
    if (!window) {
      return false;
    }

    const now = new Date();

    if (window.isRecurring && window.recurrenceRule) {
      return this.evaluateRecurrence(window, now);
    }

    return now >= window.startTime && now <= window.endTime;
  }

  /**
   * Decide whether `now` falls inside an occurrence of a recurring window.
   *
   * The RRULE string describes when each occurrence starts; every occurrence
   * lasts as long as the template window (endTime - startTime). Returns false
   * when the rule is missing or cannot be parsed.
   */
  private static evaluateRecurrence(
    window: MaintenanceWindowConfig,
    now: Date
  ): boolean {
    if (!window.recurrenceRule) {
      return false;
    }

    try {
      const RRule = require('rrule').RRule;

      // Anchor the recurrence to the window's own start time. Without an
      // explicit DTSTART, rrule anchors the series at the current time, which
      // would make an occurrence start "now" on every evaluation.
      const parsedOptions = RRule.parseString(window.recurrenceRule);
      const rule = new RRule({
        ...parsedOptions,
        dtstart: parsedOptions.dtstart ?? window.startTime,
      });

      // Latest occurrence that has already started. The first occurrence
      // after a fixed look-back is the wrong one for rules that recur more
      // than once per day and for windows longer than the look-back.
      const latestStart: Date | null = rule.before(now, true);
      if (!latestStart) {
        return false;
      }

      const durationMs = window.endTime.getTime() - window.startTime.getTime();

      // NOTE(review): window.timezone is not applied here; confirm whether
      // the RRULE should be evaluated in that timezone.
      return now.getTime() <= latestStart.getTime() + durationMs;
    } catch (error) {
      logger.error('Error evaluating recurrence rule:', error);
      return false;
    }
  }

  /**
   * Check whether the condition of a condition-based rule holds.
   *
   * A rule without a condition, or whose condition has no clause configured
   * (for example `{}`), is unconditional: its match criteria alone decide.
   * Otherwise the rule is active when a non-resolved alert carrying one of
   * `whenAlertActiveWithLabelIds` exists in the project.
   */
  private static async isConditionMet(
    rule: AlertSuppressionRule,
    projectId: ObjectID
  ): Promise<boolean> {
    const condition = rule.condition;

    if (!condition || this.isEmptyCondition(condition)) {
      return true;
    }

    if (condition.whenAlertActiveWithLabelIds?.length) {
      const resolvedStateId =
        await AlertStateService.getResolvedStateId(projectId);

      const activeAlert = await AlertService.findOneBy({
        query: {
          projectId,
          labels: QueryHelper.any(condition.whenAlertActiveWithLabelIds),
          currentAlertStateId: QueryHelper.notEquals(resolvedStateId),
        },
        select: { _id: true },
        props: { isRoot: true },
      });

      if (activeAlert) {
        return true;
      }
    }

    // TODO: condition.whenMonitorInStateIds is not evaluated yet; a rule that
    // relies only on it never becomes active.

    return false;
  }

  /**
   * True when no clause of the condition carries a value (every property is
   * null/undefined or an empty array).
   */
  private static isEmptyCondition(condition: object): boolean {
    return Object.values(condition).every(
      (value) =>
        value === null ||
        value === undefined ||
        (Array.isArray(value) && value.length === 0)
    );
  }

  /**
   * Count this alert against the rule's rate limit and report whether the
   * limit is now exceeded.
   *
   * The first alert of a window creates the throttle state and is never
   * throttled. Later alerts increase the counter; the rule becomes active
   * once the counter is greater than `maxAlerts`.
   */
  private static async isRateLimitExceeded(
    rule: AlertSuppressionRule,
    alertData: Partial<Alert>,
    projectId: ObjectID
  ): Promise<boolean> {
    const rateLimit = rule.rateLimit;
    if (!rateLimit) {
      return false;
    }

    const throttleKey = this.computeThrottleKey(rule, alertData);
    const now = new Date();

    // Only a state whose window has not expired yet is reused.
    const state = await AlertThrottleStateService.findOneBy({
      query: {
        throttleKey,
        suppressionRuleId: rule.id!,
        windowExpiresAt: QueryHelper.greaterThan(now),
      },
      select: {
        _id: true,
        alertCount: true,
        isThrottling: true,
      },
      props: { isRoot: true },
    });

    if (!state) {
      // First alert of a new window: open the window and let the alert pass.
      await AlertThrottleStateService.create({
        data: {
          projectId,
          throttleKey,
          suppressionRuleId: rule.id!,
          alertCount: 1,
          firstAlertAt: now,
          lastAlertAt: now,
          windowExpiresAt: OneUptimeDate.addRemoveMinutes(
            now,
            rateLimit.timeWindowMinutes
          ),
          isThrottling: false,
        } as AlertThrottleState,
        props: { isRoot: true },
      });

      return false;
    }

    // NOTE(review): this read-then-write is not atomic; alerts evaluated
    // concurrently for the same key can overwrite each other's count.
    const newCount = (state.alertCount || 0) + 1;
    const shouldThrottle = newCount > rateLimit.maxAlerts;

    await AlertThrottleStateService.updateOneById({
      id: state.id!,
      data: {
        alertCount: newCount,
        lastAlertAt: new Date(),
        isThrottling: shouldThrottle,
      },
      props: { isRoot: true },
    });

    return shouldThrottle;
  }

  /**
   * Build the key under which alerts are counted for a rate-limit rule: the
   * rule ID followed by one segment per supported `groupByFields` entry
   * (monitorId, alertSeverityId/severity, title). Other field names are
   * ignored.
   */
  private static computeThrottleKey(
    rule: AlertSuppressionRule,
    alertData: Partial<Alert>
  ): string {
    const parts: Array<string> = [`rule:${rule.id?.toString()}`];

    for (const field of rule.rateLimit?.groupByFields || []) {
      switch (field) {
        case 'monitorId':
          parts.push(`monitor:${alertData.monitorId?.toString() || 'null'}`);
          break;
        case 'alertSeverityId':
        case 'severity':
          parts.push(
            `severity:${alertData.alertSeverityId?.toString() || 'null'}`
          );
          break;
        case 'title':
          parts.push(`title:${alertData.title || 'null'}`);
          break;
      }
    }

    return parts.join('|');
  }

  /**
   * Build the human-readable reason stored with a suppressed alert.
   */
  private static buildSuppressionReason(rule: AlertSuppressionRule): string {
    switch (rule.type) {
      case SuppressionRuleType.MaintenanceWindow:
        return `Suppressed by maintenance window: ${rule.name}`;
      case SuppressionRuleType.ConditionBased:
        return `Suppressed by condition: ${rule.name}`;
      case SuppressionRuleType.RateLimit:
        return `Suppressed by rate limit: ${rule.name} (max ${rule.rateLimit?.maxAlerts} per ${rule.rateLimit?.timeWindowMinutes} min)`;
      default:
        return `Suppressed by rule: ${rule.name}`;
    }
  }

  /**
   * Write the audit-trail entry for a suppressed alert and bump the
   * suppressed counter of the responsible rule.
   */
  private static async logSuppression(
    alertData: Partial<Alert>,
    rule: AlertSuppressionRule,
    projectId: ObjectID,
    reason: string,
    action: SuppressionAction
  ): Promise<void> {
    await SuppressedAlertLogService.create({
      data: {
        projectId,
        suppressionRuleId: rule.id,
        alertData: alertData as object,
        alertTitle: alertData.title,
        suppressionReason: reason,
        action,
        suppressedAt: new Date(),
        monitorId: alertData.monitorId,
      } as SuppressedAlertLog,
      props: { isRoot: true },
    });

    await AlertSuppressionRuleService.incrementSuppressedCount(rule.id!);
  }
}
|
||||
|
||||
// Remaining imports. Note: ES `import` declarations are hoisted, so placing them at the end of the file does not by itself avoid circular dependencies.
|
||||
import AlertService from '../../Services/AlertService';
|
||||
import AlertStateService from '../../Services/AlertStateService';
|
||||
import AlertThrottleState from '../../Models/DatabaseModels/AlertThrottleState';
|
||||
import SuppressedAlertLog from '../../Models/DatabaseModels/SuppressedAlertLog';
|
||||
import QueryHelper from '../../Types/Database/QueryHelper';
|
||||
import logger from '../../Utils/Logger';
|
||||
import { MaintenanceWindowConfig } from '../../Models/DatabaseModels/AlertSuppressionRule';
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Integration with AlertService
|
||||
|
||||
Modify `/Common/Server/Services/AlertService.ts`:
|
||||
|
||||
```typescript
|
||||
// Add import
|
||||
import SuppressionEngine from '../Utils/Alert/SuppressionEngine';
|
||||
|
||||
// In onBeforeCreate() method, add suppression check:
|
||||
protected async onBeforeCreate(
  createBy: CreateBy<Alert>
): Promise<OnCreate<Alert>> {
  // ... existing code ...

  // Ask the suppression engine what should happen to this alert before it
  // is stored.
  const { shouldSuppress, action, reason, matchedRules } =
    await SuppressionEngine.evaluate(createBy.data, createBy.data.projectId!);

  if (shouldSuppress) {
    const blocksCreation =
      action === SuppressionAction.SuppressCreation ||
      action === SuppressionAction.Both;

    if (blocksCreation) {
      // Abort the create so the alert is never persisted.
      throw new SuppressedAlertException(reason || 'Alert suppressed by rule');
    }

    // The alert is still created; record that its notifications are
    // suppressed and which rule (the first match) is responsible.
    createBy.data.notificationsSuppressed = true;
    createBy.data.suppressedByRuleId = matchedRules[0]?.id;
  }

  // ... rest of existing code ...
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. SuppressedAlertLogService
|
||||
|
||||
**File Location:** `/Common/Server/Services/SuppressedAlertLogService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import SuppressedAlertLog from '../Models/DatabaseModels/SuppressedAlertLog';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
|
||||
/**
 * Data access for the suppressed-alert audit log.
 */
export class Service extends DatabaseService<SuppressedAlertLog> {
  public constructor() {
    super(SuppressedAlertLog);
  }

  /**
   * Get the most recent suppressed alerts recorded for a rule.
   *
   * @param ruleId - Suppression rule to look up.
   * @param limit - Maximum number of log entries to return (default 100).
   * @returns Log entries sorted by suppressedAt, newest first.
   */
  public async getSuppressedByRule(
    ruleId: ObjectID,
    limit: number = 100
  ): Promise<Array<SuppressedAlertLog>> {
    return await this.findBy({
      query: { suppressionRuleId: ruleId },
      select: {
        _id: true,
        alertTitle: true,
        suppressionReason: true,
        action: true,
        suppressedAt: true,
        monitorId: true,
      },
      sort: { suppressedAt: SortOrder.Descending },
      limit,
      props: { isRoot: true },
    });
  }

  /**
   * Get suppression statistics for a project within a period.
   *
   * The models in this plan are TypeORM entities, so the per-rule and
   * per-action counts are tallied from the matching log rows rather than
   * through a document-store aggregation pipeline.
   *
   * NOTE(review): the tally loads every log row of the period. For large
   * periods, replace it with a grouped COUNT query, and confirm that findBy
   * applies no implicit row limit that would truncate the counts.
   *
   * @param projectId - Project to report on.
   * @param startDate - Start of the period.
   * @param endDate - End of the period.
   * @returns Total number of suppressed alerts plus counts per rule and per
   *   action.
   */
  public async getStatistics(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<{
    totalSuppressed: number;
    byRule: Array<{ ruleId: string; count: number }>;
    byAction: Array<{ action: string; count: number }>;
  }> {
    // Same filter for the total and for the breakdowns so they stay consistent.
    const periodQuery = {
      projectId,
      suppressedAt: QueryHelper.between(startDate, endDate),
    };

    const totalSuppressed = await this.countBy({
      query: periodQuery,
      props: { isRoot: true },
    });

    const logs = await this.findBy({
      query: periodQuery,
      select: {
        suppressionRuleId: true,
        action: true,
      },
      props: { isRoot: true },
    });

    const countByRule = new Map<string, number>();
    const countByAction = new Map<string, number>();

    for (const log of logs) {
      const ruleId = log.suppressionRuleId?.toString();
      if (ruleId) {
        countByRule.set(ruleId, (countByRule.get(ruleId) ?? 0) + 1);
      }

      if (log.action) {
        const action = String(log.action);
        countByAction.set(action, (countByAction.get(action) ?? 0) + 1);
      }
    }

    return {
      totalSuppressed,
      byRule: Array.from(countByRule, ([ruleId, count]) => ({ ruleId, count })),
      byAction: Array.from(countByAction, ([action, count]) => ({
        action,
        count,
      })),
    };
  }
}
|
||||
|
||||
export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. AlertThrottleStateService
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertThrottleStateService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import AlertThrottleState from '../Models/DatabaseModels/AlertThrottleState';
|
||||
|
||||
/**
 * Data access for rate-limit throttle states.
 */
export class Service extends DatabaseService<AlertThrottleState> {
  public constructor() {
    super(AlertThrottleState);
  }

  /**
   * Delete every throttle state whose windowExpiresAt lies before the
   * current time.
   *
   * @returns Whatever deleteBy() reports for the deletion (declared as a
   *   number).
   */
  public async cleanupExpired(): Promise<number> {
    const cutoff = new Date();

    return await this.deleteBy({
      query: {
        windowExpiresAt: QueryHelper.lessThan(cutoff),
      },
      props: { isRoot: true },
    });
  }
}
|
||||
|
||||
export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Worker Jobs
|
||||
|
||||
### 1. ThrottleStateCleanup Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertSuppression/ThrottleStateCleanup.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
|
||||
import { EVERY_HOUR } from 'Common/Utils/CronTime';
|
||||
import AlertThrottleStateService from 'Common/Server/Services/AlertThrottleStateService';
|
||||
|
||||
RunCron(
  'AlertSuppression:ThrottleStateCleanup',
  { schedule: EVERY_HOUR, runOnStartup: false },
  async () => {
    // Remove throttle states whose window has expired and report how many
    // were removed; stay silent when there was nothing to clean up.
    const removed = await AlertThrottleStateService.cleanupExpired();

    if (!(removed > 0)) {
      return;
    }

    logger.info(`Cleaned up ${removed} expired throttle states`);
  }
);
|
||||
```
|
||||
|
||||
### 2. MaintenanceWindowNotification Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertSuppression/MaintenanceWindowNotification.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
|
||||
import { EVERY_MINUTE } from 'Common/Utils/CronTime';
|
||||
import AlertSuppressionRuleService from 'Common/Server/Services/AlertSuppressionRuleService';
|
||||
import { SuppressionRuleType } from 'Common/Models/DatabaseModels/AlertSuppressionRule';
|
||||
|
||||
RunCron(
  'AlertSuppression:MaintenanceWindowNotification',
  { schedule: EVERY_MINUTE, runOnStartup: false },
  async () => {
    // How long before the start of a window the notification is sent.
    const LEAD_TIME_MS = 15 * 60 * 1000;
    // Interval between two runs of this job (EVERY_MINUTE).
    const RUN_INTERVAL_MS = 60 * 1000;

    const maintenanceRules = await AlertSuppressionRuleService.findBy({
      query: {
        type: SuppressionRuleType.MaintenanceWindow,
        isEnabled: true,
      },
      select: {
        _id: true,
        projectId: true,
        name: true,
        maintenanceWindow: true,
      },
      props: { isRoot: true },
    });

    // Each run only handles windows starting in its own one-minute slice,
    // 14 to 15 minutes ahead. Matching "any start within the next 15
    // minutes" on a job that runs every minute would announce the same
    // window on up to 15 consecutive runs. Aligning to the start of the
    // current minute keeps consecutive slices gap-free and non-overlapping
    // even when the job fires a few seconds late.
    const minuteStartMs =
      Math.floor(Date.now() / RUN_INTERVAL_MS) * RUN_INTERVAL_MS;
    const sliceEndMs = minuteStartMs + LEAD_TIME_MS;
    const sliceStartMs = sliceEndMs - RUN_INTERVAL_MS;

    for (const rule of maintenanceRules) {
      const window = rule.maintenanceWindow;
      if (!window) continue;

      // NOTE(review): only the configured startTime is considered; later
      // occurrences of recurring windows are not announced.
      const startsAtMs = window.startTime.getTime();

      if (startsAtMs > sliceStartMs && startsAtMs <= sliceEndMs) {
        // Send notification about upcoming maintenance window
        await NotificationService.sendMaintenanceWindowNotification({
          projectId: rule.projectId!,
          ruleName: rule.name!,
          startsAt: window.startTime,
          endsAt: window.endTime,
        });
      }
    }
  }
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Phase 1: Core Services
|
||||
- [ ] Create AlertSuppressionRuleService
|
||||
- [ ] Create AlertSuppressionGroupService
|
||||
- [ ] Create SuppressedAlertLogService
|
||||
- [ ] Create AlertThrottleStateService
|
||||
- [ ] Create SuppressionEngine
|
||||
|
||||
### Phase 2: Integration
|
||||
- [ ] Modify AlertService.onBeforeCreate()
|
||||
- [ ] Add SuppressedAlertException
|
||||
- [ ] Add notification suppression field to Alert
|
||||
|
||||
### Phase 3: Worker Jobs
|
||||
- [ ] Create ThrottleStateCleanup job
|
||||
- [ ] Create MaintenanceWindowNotification job
|
||||
- [ ] Register jobs in worker startup
|
||||
|
||||
### Phase 4: Testing
|
||||
- [ ] Unit tests for SuppressionEngine
|
||||
- [ ] Unit tests for criteria matching
|
||||
- [ ] Unit tests for rate limiting
|
||||
- [ ] Integration tests for full suppression flow
|
||||
499
Docs/Plan/AlertSuppression/3-API.md
Normal file
499
Docs/Plan/AlertSuppression/3-API.md
Normal file
@@ -0,0 +1,499 @@
|
||||
# API Design for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the REST API endpoints for Alert Suppression functionality.
|
||||
|
||||
## Base URLs
|
||||
|
||||
```
|
||||
/api/project/{projectId}/alert-suppression-rule
|
||||
/api/project/{projectId}/alert-suppression-group
|
||||
/api/project/{projectId}/suppressed-alert-log
|
||||
/api/project/{projectId}/maintenance-window
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Suppression Rules API
|
||||
|
||||
### List Suppression Rules
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-rule
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `type` | string | Filter by rule type (maintenance_window, condition_based, rate_limit) |
|
||||
| `isEnabled` | boolean | Filter by enabled status |
|
||||
| `suppressionGroupId` | ObjectID | Filter by suppression group |
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "rule-id-1",
|
||||
"name": "Nightly Maintenance Window",
|
||||
"description": "Suppress alerts during nightly deployments",
|
||||
"type": "maintenance_window",
|
||||
"isEnabled": true,
|
||||
"priority": 1,
|
||||
"matchCriteria": {
|
||||
"matchAll": true
|
||||
},
|
||||
"maintenanceWindow": {
|
||||
"startTime": "2026-01-20T02:00:00Z",
|
||||
"endTime": "2026-01-20T04:00:00Z",
|
||||
"timezone": "America/Los_Angeles",
|
||||
"isRecurring": true,
|
||||
"recurrenceRule": "FREQ=DAILY"
|
||||
},
|
||||
"action": "both",
|
||||
"suppressedCount": 156,
|
||||
"lastTriggeredAt": "2026-01-20T02:15:00Z"
|
||||
},
|
||||
{
|
||||
"_id": "rule-id-2",
|
||||
"name": "Rate Limit - 10/hour per monitor",
|
||||
"type": "rate_limit",
|
||||
"isEnabled": true,
|
||||
"priority": 2,
|
||||
"matchCriteria": {},
|
||||
"rateLimit": {
|
||||
"maxAlerts": 10,
|
||||
"timeWindowMinutes": 60,
|
||||
"groupByFields": ["monitorId"]
|
||||
},
|
||||
"action": "suppress_creation",
|
||||
"suppressedCount": 523
|
||||
}
|
||||
],
|
||||
"count": 5,
|
||||
"skip": 0,
|
||||
"limit": 10
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Get Suppression Rule
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Create Suppression Rule
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-rule
|
||||
```
|
||||
|
||||
**Request Body (Maintenance Window):**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Weekend Maintenance",
|
||||
"description": "Suppress alerts during weekend maintenance",
|
||||
"type": "maintenance_window",
|
||||
"isEnabled": true,
|
||||
"priority": 1,
|
||||
"matchCriteria": {
|
||||
"labelIds": ["production-label-id"]
|
||||
},
|
||||
"maintenanceWindow": {
|
||||
"startTime": "2026-01-25T00:00:00Z",
|
||||
"endTime": "2026-01-25T06:00:00Z",
|
||||
"timezone": "America/New_York",
|
||||
"isRecurring": true,
|
||||
"recurrenceRule": "FREQ=WEEKLY;BYDAY=SA,SU"
|
||||
},
|
||||
"action": "both"
|
||||
}
|
||||
```
|
||||
|
||||
**Request Body (Rate Limit):**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Alert Storm Protection",
|
||||
"description": "Limit alerts to 20 per hour per monitor",
|
||||
"type": "rate_limit",
|
||||
"isEnabled": true,
|
||||
"priority": 10,
|
||||
"matchCriteria": {
|
||||
"severityIds": ["warning-id", "info-id"]
|
||||
},
|
||||
"rateLimit": {
|
||||
"maxAlerts": 20,
|
||||
"timeWindowMinutes": 60,
|
||||
"groupByFields": ["monitorId"]
|
||||
},
|
||||
"action": "suppress_creation"
|
||||
}
|
||||
```
|
||||
|
||||
**Request Body (Condition-Based):**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Suppress Staging Alerts",
|
||||
"description": "Suppress notifications for staging environment",
|
||||
"type": "condition_based",
|
||||
"isEnabled": true,
|
||||
"priority": 5,
|
||||
"matchCriteria": {
|
||||
"labelIds": ["staging-label-id"]
|
||||
},
|
||||
"condition": {},
|
||||
"action": "suppress_notifications"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Update Suppression Rule
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-suppression-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Delete Suppression Rule
|
||||
|
||||
```http
|
||||
DELETE /api/project/{projectId}/alert-suppression-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Enable/Disable Rule
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/enable
|
||||
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/disable
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Test Suppression Rule
|
||||
|
||||
Test which alerts would be suppressed by a rule.
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/test
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertIds": ["alert-id-1", "alert-id-2", "alert-id-3"]
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"alertId": "alert-id-1",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"wouldSuppress": true,
|
||||
"action": "both",
|
||||
"reason": "Matches criteria and maintenance window is active"
|
||||
},
|
||||
{
|
||||
"alertId": "alert-id-2",
|
||||
"alertTitle": "API latency high",
|
||||
"wouldSuppress": false,
|
||||
"reason": "Does not match severity criteria"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Maintenance Windows API
|
||||
|
||||
Convenience endpoints that work specifically with maintenance windows (suppression rules of type `maintenance_window`).
|
||||
|
||||
### List Active Maintenance Windows
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/maintenance-window/active
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "rule-id-1",
|
||||
"name": "Nightly Maintenance",
|
||||
"startedAt": "2026-01-20T02:00:00Z",
|
||||
"endsAt": "2026-01-20T04:00:00Z",
|
||||
"remainingMinutes": 45,
|
||||
"matchCriteria": { "matchAll": true }
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### List Upcoming Maintenance Windows
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/maintenance-window/upcoming
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `hours` | number | Look ahead hours (default: 24) |
|
||||
|
||||
---
|
||||
|
||||
### Quick Create Maintenance Window
|
||||
|
||||
Simplified endpoint for creating one-time maintenance windows.
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/maintenance-window/quick
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Emergency Deployment",
|
||||
"durationMinutes": 60,
|
||||
"matchCriteria": {
|
||||
"monitorIds": ["monitor-1", "monitor-2"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Creates a one-time maintenance window that starts immediately and ends `durationMinutes` minutes later.
|
||||
|
||||
---
|
||||
|
||||
## Suppression Groups API
|
||||
|
||||
### List Suppression Groups
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-group
|
||||
```
|
||||
|
||||
### Create Suppression Group
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-group
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Database Alerts",
|
||||
"description": "Group for database-related suppression rules",
|
||||
"throttleMinutes": 30
|
||||
}
|
||||
```
|
||||
|
||||
### Get Group with Rules
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-group/{groupId}/rules
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Suppressed Alert Log API
|
||||
|
||||
### List Suppressed Alerts
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/suppressed-alert-log
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `suppressionRuleId` | ObjectID | Filter by rule |
|
||||
| `monitorId` | ObjectID | Filter by monitor |
|
||||
| `action` | string | Filter by action |
|
||||
| `suppressedAt` | DateRange | Filter by date |
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "log-id-1",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"suppressionRule": {
|
||||
"_id": "rule-id",
|
||||
"name": "Nightly Maintenance"
|
||||
},
|
||||
"suppressionReason": "Suppressed by maintenance window: Nightly Maintenance",
|
||||
"action": "both",
|
||||
"suppressedAt": "2026-01-20T02:15:00Z",
|
||||
"monitor": {
|
||||
"_id": "monitor-id",
|
||||
"name": "MySQL Production"
|
||||
},
|
||||
"alertData": {
|
||||
"title": "MySQL connection timeout",
|
||||
"description": "Connection to MySQL timed out after 30s",
|
||||
"severity": "High"
|
||||
}
|
||||
}
|
||||
],
|
||||
"count": 156,
|
||||
"skip": 0,
|
||||
"limit": 10
|
||||
}
|
||||
```
|
||||
|
||||
### Get Suppression Statistics
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/suppressed-alert-log/statistics
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"period": {
|
||||
"startDate": "2026-01-13T00:00:00Z",
|
||||
"endDate": "2026-01-20T00:00:00Z"
|
||||
},
|
||||
"totalSuppressed": 1234,
|
||||
"byRule": [
|
||||
{ "ruleId": "rule-1", "ruleName": "Nightly Maintenance", "count": 523 },
|
||||
{ "ruleId": "rule-2", "ruleName": "Rate Limit", "count": 711 }
|
||||
],
|
||||
"byAction": [
|
||||
{ "action": "suppress_creation", "count": 890 },
|
||||
{ "action": "suppress_notifications", "count": 244 },
|
||||
{ "action": "both", "count": 100 }
|
||||
],
|
||||
"byDay": [
|
||||
{ "date": "2026-01-13", "count": 156 },
|
||||
{ "date": "2026-01-14", "count": 178 },
|
||||
{ "date": "2026-01-15", "count": 145 }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Permissions
|
||||
|
||||
| Endpoint | Required Permission |
|
||||
|----------|---------------------|
|
||||
| GET suppression rules | `ProjectMember` |
|
||||
| Create/Update/Delete rules | `ProjectAdmin` |
|
||||
| Enable/Disable rules | `ProjectAdmin` |
|
||||
| GET suppressed logs | `ProjectMember` |
|
||||
| GET statistics | `ProjectMember` |
|
||||
|
||||
---
|
||||
|
||||
## Error Responses
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "INVALID_RECURRENCE_RULE",
|
||||
"message": "Invalid RRULE format: FREQ=INVALID"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Error Codes:**
|
||||
|
||||
| Code | Description |
|
||||
|------|-------------|
|
||||
| `INVALID_RECURRENCE_RULE` | Invalid RRULE format |
|
||||
| `INVALID_TIME_WINDOW` | End time before start time |
|
||||
| `RULE_NOT_FOUND` | Suppression rule doesn't exist |
|
||||
| `CANNOT_DELETE_ACTIVE_WINDOW` | Cannot delete currently active maintenance window |
|
||||
| `OVERLAPPING_WINDOWS` | Maintenance windows overlap (warning only) |
|
||||
|
||||
---
|
||||
|
||||
## Webhooks
|
||||
|
||||
### Suppression Events
|
||||
|
||||
Configure webhooks to receive suppression events:
|
||||
|
||||
```json
|
||||
{
|
||||
"event": "alert.suppressed",
|
||||
"timestamp": "2026-01-20T02:15:00Z",
|
||||
"data": {
|
||||
"projectId": "project-id",
|
||||
"suppressionRuleId": "rule-id",
|
||||
"suppressionRuleName": "Nightly Maintenance",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"action": "both",
|
||||
"reason": "Maintenance window active"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Suppression Rule API
|
||||
- [ ] GET /alert-suppression-rule (list)
|
||||
- [ ] GET /alert-suppression-rule/:id (details)
|
||||
- [ ] POST /alert-suppression-rule (create)
|
||||
- [ ] PUT /alert-suppression-rule/:id (update)
|
||||
- [ ] DELETE /alert-suppression-rule/:id (delete)
|
||||
- [ ] POST /alert-suppression-rule/:id/enable
|
||||
- [ ] POST /alert-suppression-rule/:id/disable
|
||||
- [ ] POST /alert-suppression-rule/:id/test
|
||||
|
||||
### Maintenance Window API
|
||||
- [ ] GET /maintenance-window/active
|
||||
- [ ] GET /maintenance-window/upcoming
|
||||
- [ ] POST /maintenance-window/quick
|
||||
|
||||
### Suppression Group API
|
||||
- [ ] GET /alert-suppression-group (list)
|
||||
- [ ] POST /alert-suppression-group (create)
|
||||
- [ ] GET /alert-suppression-group/:id/rules
|
||||
|
||||
### Suppressed Log API
|
||||
- [ ] GET /suppressed-alert-log (list)
|
||||
- [ ] GET /suppressed-alert-log/statistics
|
||||
464
Docs/Plan/AlertSuppression/4-UI.md
Normal file
464
Docs/Plan/AlertSuppression/4-UI.md
Normal file
@@ -0,0 +1,464 @@
|
||||
# UI Implementation for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the frontend components and pages required for Alert Suppression functionality.
|
||||
|
||||
## Navigation Structure
|
||||
|
||||
```
|
||||
Dashboard
|
||||
├── Alerts
|
||||
│ ├── All Alerts (existing)
|
||||
│ ├── Episodes
|
||||
│ └── Suppressed Alerts (NEW)
|
||||
└── Settings
|
||||
├── Alerts
|
||||
│ ├── Alert States (existing)
|
||||
│ ├── Alert Severities (existing)
|
||||
│ ├── Grouping Rules
|
||||
│ └── Suppression Rules (NEW)
|
||||
└── Maintenance
|
||||
└── Maintenance Windows (NEW)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages to Create
|
||||
|
||||
### 1. Suppression Rules List Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertSuppressionRules.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-suppression-rules`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Suppression Rules [+ Create Rule] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Suppression rules prevent alert creation or notifications based on conditions. │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [All] [Maintenance Windows] [Rate Limits] [Condition-Based] │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ✅ Nightly Maintenance Window Priority: 1 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 🕐 Type: Maintenance Window │ │
|
||||
│ │ 📅 Schedule: Daily 2:00 AM - 4:00 AM PST │ │
|
||||
│ │ 🎯 Affects: All monitors │ │
|
||||
│ │ 🚫 Action: Suppress creation and notifications │ │
|
||||
│ │ 📊 Suppressed: 523 alerts │ │
|
||||
│ │ [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ✅ Rate Limit: 10 alerts/hour per monitor Priority: 2 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 📈 Type: Rate Limit │ │
|
||||
│ │ ⚡ Limit: 10 alerts per 60 minutes │ │
|
||||
│ │ 📦 Group by: Monitor │ │
|
||||
│ │ 🚫 Action: Suppress creation after threshold │ │
|
||||
│ │ 📊 Suppressed: 1,247 alerts │ │
|
||||
│ │ [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ❌ Staging Environment (Disabled) Priority: 3 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 🔧 Type: Condition-Based │ │
|
||||
│ │ 🎯 Matches: Labels contain "staging" │ │
|
||||
│ │ 🚫 Action: Suppress notifications only │ │
|
||||
│ │ [Enable] [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Create/Edit Suppression Rule Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertSuppressionRuleView/Index.tsx`
|
||||
|
||||
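**Route:** `/dashboard/:projectId/settings/alert-suppression-rules/create` (create) and `/dashboard/:projectId/settings/alert-suppression-rules/:ruleId` (edit an existing rule)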
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Create Suppression Rule │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ BASIC INFORMATION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Rule Name * │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Nightly Maintenance Window │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Description │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Suppress all alerts during nightly deployment window │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Rule Type * │
|
||||
│ ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ │
|
||||
│ │ ● Maintenance Window │ │ ○ Condition-Based │ │ ○ Rate Limit │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ Time-based │ │ Attribute-based │ │ Threshold-based │ │
|
||||
│ │ suppression │ │ suppression │ │ suppression │ │
|
||||
│ └──────────────────────┘ └──────────────────────┘ └──────────────────────┘ │
|
||||
│ │
|
||||
│ MATCHING CRITERIA │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ Which alerts should this rule apply to? │
|
||||
│ │
|
||||
│ ○ All alerts │
|
||||
│ ● Alerts matching specific criteria │
|
||||
│ │
|
||||
│ Severities (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [Critical ×] [High ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Monitors (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Select monitors... [Browse] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Labels (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [production ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ MAINTENANCE WINDOW │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Start Date & Time * End Date & Time * │
|
||||
│ ┌────────────────────────────────┐ ┌────────────────────────────────┐ │
|
||||
│ │ 2026-01-20 02:00 AM │ │ 2026-01-20 04:00 AM │ │
|
||||
│ └────────────────────────────────┘ └────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Timezone * │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ America/Los_Angeles (PST) [▼] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ☑ Recurring window │
|
||||
│ │
|
||||
│ Repeat * │
|
||||
│ ┌───────────────┐ │
|
||||
│ │ Daily [▼] │ │
|
||||
│ └───────────────┘ │
|
||||
│ │
|
||||
│ ○ Never ends │
|
||||
│ ● Ends on: [2026-12-31] │
|
||||
│ │
|
||||
│ SUPPRESSION ACTION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ What should happen when this rule matches? * │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ● Suppress both alert creation and notifications (Recommended) │ │
|
||||
│ │ ○ Suppress alert creation only │ │
|
||||
│ │ ○ Suppress notifications only (alert is still created) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Priority (lower = evaluated first) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 1 │ │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ [Cancel] [Test Rule] [Save] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Rate Limit Rule Form (Conditional Section)
|
||||
|
||||
When "Rate Limit" is selected as the rule type, the form shows the following section in place of the maintenance window settings:
|
||||
|
||||
```
|
||||
│ RATE LIMIT CONFIGURATION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Maximum alerts allowed * │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 10 │ alerts │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Time window * │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 60 │ minutes │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Group rate limit by: │
|
||||
│ ☑ Monitor (separate limit per monitor) │
|
||||
│ ☐ Severity (separate limit per severity) │
|
||||
│ ☐ None (global limit) │
|
||||
│ │
|
||||
│ Example: With these settings, each monitor can generate up to 10 alerts per hour. │
|
||||
│ Additional alerts from the same monitor will be suppressed. │
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Suppressed Alerts Log Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/SuppressedAlerts.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/suppressed`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts > Suppressed Alerts │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 📊 Last 7 Days: 1,234 alerts suppressed (saves ~40% notification volume) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Filters: │
|
||||
│ [Rule: All ▼] [Monitor: All ▼] [Action: All ▼] [Date: Last 7 days ▼] │
|
||||
│ │
|
||||
│ ┌───────┬──────────────────────────────────┬───────────────┬──────────┬───────────┐│
|
||||
│ │ Time │ Alert Title │ Rule │ Action │ Monitor ││
|
||||
│ ├───────┼──────────────────────────────────┼───────────────┼──────────┼───────────┤│
|
||||
│ │ 2:15 │ MySQL connection timeout │ Nightly Maint │ Both │ mysql-01 ││
|
||||
│ │ 2:14 │ MySQL connection timeout │ Nightly Maint │ Both │ mysql-01 ││
|
||||
│ │ 2:12 │ API response time > 5s │ Rate Limit │ Creation │ api-gw ││
|
||||
│ │ 2:10 │ Disk space warning │ Rate Limit │ Creation │ web-03 ││
|
||||
│ │ 2:08 │ Memory usage high │ Nightly Maint │ Both │ app-01 ││
|
||||
│ └───────┴──────────────────────────────────┴───────────────┴──────────┴───────────┘│
|
||||
│ │
|
||||
│ [1] [2] [3] ... [Next →] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. Maintenance Windows Calendar View
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/MaintenanceWindows.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/maintenance-windows`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Maintenance Windows [+ Schedule Maintenance] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ [Calendar View] [List View] │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ January 2026 │ │
|
||||
│ │ ◀ ▶ │ │
|
||||
│ │ ───────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ Sun Mon Tue Wed Thu Fri Sat │ │
|
||||
│ │ ───────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 1 2 3 4 │ │
|
||||
│ │ ┌─────┐ │ │
|
||||
│ │ │2-4AM│ │ │
|
||||
│ │ └─────┘ │ │
|
||||
│ │ 5 6 7 8 9 10 11 │ │
|
||||
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
|
||||
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │ │
|
||||
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
|
||||
│ │ 12 13 14 15 16 17 18 │ │
|
||||
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
|
||||
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │ │
|
||||
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
|
||||
│ │ 19 20 21 22 23 24 25 │ │
|
||||
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────────┐ │ │
|
||||
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ ││Weekend │ │ │
|
||||
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ ││00:00- │ │ │
|
||||
│ │ ││06:00 │ │ │
|
||||
│ │ │└─────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Legend: │
|
||||
│ ┌─────┐ Nightly Maintenance (Daily 2-4 AM) │
|
||||
│ └─────┘ │
|
||||
│ ┌─────────┐ Weekend Deployment (Sat 00:00-06:00) │
|
||||
│ └─────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 6. Active Maintenance Banner
|
||||
|
||||
Show a banner on the Alerts page while a maintenance window is active.
|
||||
|
||||
**Component:** `/Dashboard/src/Components/Alert/MaintenanceBanner.tsx`
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 🔧 MAINTENANCE ACTIVE: "Nightly Maintenance Window" - Ends in 1h 45m │
|
||||
│ Alerts matching this window will be suppressed. [View Details]│
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 7. Quick Maintenance Modal
|
||||
|
||||
Triggered from the Alerts page or from a Monitor detail page.
|
||||
|
||||
**Component:** `/Dashboard/src/Components/Suppression/QuickMaintenanceModal.tsx`
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Start Maintenance Window [X] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Quick maintenance window starting now. │
|
||||
│ │
|
||||
│ Name │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Emergency maintenance │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Duration │
|
||||
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ 15 min │ │ 30 min │ │ ● 1 hour │ │ 2 hours │ │
|
||||
│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────┐ │
|
||||
│ │ Custom: [__] │ minutes │
|
||||
│ └──────────────┘ │
|
||||
│ │
|
||||
│ Apply to │
|
||||
│ ○ All monitors │
|
||||
│ ● Selected monitors: │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [mysql-production ×] [api-gateway ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [Cancel] [Start Maintenance] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Components to Create
|
||||
|
||||
### 1. SuppressionRuleCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/SuppressionRuleCard.tsx`
|
||||
|
||||
```typescript
|
||||
interface SuppressionRuleCardProps {
|
||||
rule: AlertSuppressionRule;
|
||||
onEdit: () => void;
|
||||
onDelete: () => void;
|
||||
onToggleEnabled: () => void;
|
||||
}
|
||||
```
|
||||
|
||||
### 2. MaintenanceWindowForm
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/MaintenanceWindowForm.tsx`
|
||||
|
||||
Handles date/time selection, timezone, and recurrence configuration.
|
||||
|
||||
### 3. RateLimitForm
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/RateLimitForm.tsx`
|
||||
|
||||
Handles the maximum alert count, the time window, and group-by field selection.
|
||||
|
||||
### 4. MatchCriteriaBuilder
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/MatchCriteriaBuilder.tsx`
|
||||
|
||||
Reusable component for building match criteria (severities, monitors, labels, patterns).
|
||||
|
||||
### 5. SuppressionActionSelector
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/SuppressionActionSelector.tsx`
|
||||
|
||||
Radio group for selecting suppression action type.
|
||||
|
||||
### 6. MaintenanceCalendar
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/MaintenanceCalendar.tsx`
|
||||
|
||||
Calendar view showing maintenance windows.
|
||||
|
||||
### 7. SuppressedAlertsBadge
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/SuppressedAlertsBadge.tsx`
|
||||
|
||||
Badge showing count of suppressed alerts.
|
||||
|
||||
---
|
||||
|
||||
## Routing Configuration
|
||||
|
||||
Add to route configuration:
|
||||
|
||||
```typescript
|
||||
// Suppression routes
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-suppression-rules',
|
||||
component: AlertSuppressionRulesPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-suppression-rules/create',
|
||||
component: CreateSuppressionRulePage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-suppression-rules/:ruleId',
|
||||
component: SuppressionRuleDetailPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/maintenance-windows',
|
||||
component: MaintenanceWindowsPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/suppressed',
|
||||
component: SuppressedAlertsPage,
|
||||
},
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pages
|
||||
- [ ] Suppression rules list page
|
||||
- [ ] Create/edit suppression rule page
|
||||
- [ ] Suppressed alerts log page
|
||||
- [ ] Maintenance windows calendar page
|
||||
|
||||
### Components
|
||||
- [ ] SuppressionRuleCard
|
||||
- [ ] MaintenanceWindowForm
|
||||
- [ ] RateLimitForm
|
||||
- [ ] MatchCriteriaBuilder
|
||||
- [ ] SuppressionActionSelector
|
||||
- [ ] MaintenanceCalendar
|
||||
- [ ] QuickMaintenanceModal
|
||||
- [ ] MaintenanceBanner
|
||||
- [ ] SuppressedAlertsBadge
|
||||
|
||||
### Navigation Updates
|
||||
- [ ] Add sidebar menu items
|
||||
- [ ] Add route configuration
|
||||
- [ ] Add navigation helpers
|
||||
551
Docs/Plan/AlertSuppression/5-Migration.md
Normal file
551
Docs/Plan/AlertSuppression/5-Migration.md
Normal file
@@ -0,0 +1,551 @@
|
||||
# Migration & Rollout Plan for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document outlines the database migrations and rollout strategy for Alert Suppression functionality.
|
||||
|
||||
## Database Migrations
|
||||
|
||||
### Migration 1: Create AlertSuppressionGroup Table
|
||||
|
||||
```typescript
|
||||
import { MigrationInterface, QueryRunner, Table, TableIndex } from 'typeorm';
|
||||
|
||||
export class CreateAlertSuppressionGroup implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertSuppressionGroup',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'name',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'description',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'throttleMinutes',
|
||||
type: 'integer',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'throttleUntil',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertSuppressionGroup',
|
||||
new TableIndex({
|
||||
name: 'idx_suppression_group_project',
|
||||
columnNames: ['projectId'],
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertSuppressionGroup');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 2: Create AlertSuppressionRule Table
|
||||
|
||||
```typescript
|
||||
export class CreateAlertSuppressionRule implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertSuppressionRule',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'name',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'description',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'type',
|
||||
type: 'varchar',
|
||||
length: '50',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'isEnabled',
|
||||
type: 'boolean',
|
||||
default: true,
|
||||
},
|
||||
{
|
||||
name: 'matchCriteria',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'maintenanceWindow',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'condition',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'rateLimit',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'action',
|
||||
type: 'varchar',
|
||||
length: '50',
|
||||
isNullable: false,
|
||||
default: "'both'",
|
||||
},
|
||||
{
|
||||
name: 'suppressionGroupId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'priority',
|
||||
type: 'integer',
|
||||
default: 100,
|
||||
},
|
||||
{
|
||||
name: 'suppressedCount',
|
||||
type: 'integer',
|
||||
default: 0,
|
||||
},
|
||||
{
|
||||
name: 'lastTriggeredAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdByUserId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'AlertSuppressionRule',
|
||||
new TableIndex({
|
||||
name: 'idx_suppression_rule_project_enabled',
|
||||
columnNames: ['projectId', 'isEnabled', 'priority'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertSuppressionRule',
|
||||
new TableIndex({
|
||||
name: 'idx_suppression_rule_type',
|
||||
columnNames: ['projectId', 'type', 'isEnabled'],
|
||||
})
|
||||
);
|
||||
|
||||
// Foreign keys
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertSuppressionRule',
|
||||
new TableForeignKey({
|
||||
columnNames: ['projectId'],
|
||||
referencedTableName: 'Project',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertSuppressionRule',
|
||||
new TableForeignKey({
|
||||
columnNames: ['suppressionGroupId'],
|
||||
referencedTableName: 'AlertSuppressionGroup',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'SET NULL',
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertSuppressionRule');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 3: Create SuppressedAlertLog Table
|
||||
|
||||
```typescript
|
||||
export class CreateSuppressedAlertLog implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'SuppressedAlertLog',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'suppressionRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'alertData',
|
||||
type: 'jsonb',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'alertTitle',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'suppressionReason',
|
||||
type: 'text',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'action',
|
||||
type: 'varchar',
|
||||
length: '50',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'suppressedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'monitorId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'SuppressedAlertLog',
|
||||
new TableIndex({
|
||||
name: 'idx_suppressed_log_project_date',
|
||||
columnNames: ['projectId', 'suppressedAt'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'SuppressedAlertLog',
|
||||
new TableIndex({
|
||||
name: 'idx_suppressed_log_rule',
|
||||
columnNames: ['suppressionRuleId', 'suppressedAt'],
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('SuppressedAlertLog');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 4: Create AlertThrottleState Table
|
||||
|
||||
```typescript
|
||||
export class CreateAlertThrottleState implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertThrottleState',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'throttleKey',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'suppressionRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'alertCount',
|
||||
type: 'integer',
|
||||
default: 0,
|
||||
},
|
||||
{
|
||||
name: 'firstAlertAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'lastAlertAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'windowExpiresAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'isThrottling',
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'AlertThrottleState',
|
||||
new TableIndex({
|
||||
name: 'idx_throttle_state_key',
|
||||
columnNames: ['throttleKey', 'windowExpiresAt'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertThrottleState',
|
||||
new TableIndex({
|
||||
name: 'idx_throttle_state_unique',
|
||||
columnNames: ['throttleKey', 'suppressionRuleId'],
|
||||
isUnique: true,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertThrottleState');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 5: Add Suppression Fields to Alert Table
|
||||
|
||||
```typescript
|
||||
export class AddSuppressionFieldsToAlert implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'notificationsSuppressed',
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'suppressedByRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropColumn('Alert', 'suppressedByRuleId');
|
||||
await queryRunner.dropColumn('Alert', 'notificationsSuppressed');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollout Strategy
|
||||
|
||||
### Phase 1: Internal Testing
|
||||
|
||||
**Duration:** 1 week
|
||||
|
||||
- Deploy to staging environment
|
||||
- Create test suppression rules
|
||||
- Verify suppression logic works correctly
|
||||
- Test all three rule types
|
||||
|
||||
### Phase 2: Beta (Opt-in)
|
||||
|
||||
**Duration:** 2 weeks
|
||||
|
||||
- Enable feature flag for early adopters
|
||||
- Collect feedback on UI/UX
|
||||
- Monitor for performance issues
|
||||
- Document common use cases
|
||||
|
||||
### Phase 3: General Availability
|
||||
|
||||
**Duration:** Ongoing
|
||||
|
||||
- Enable for all projects
|
||||
- No suppression rules are active by default
|
||||
- Users opt-in by creating rules
|
||||
|
||||
---
|
||||
|
||||
## Data Retention
|
||||
|
||||
### SuppressedAlertLog Retention
|
||||
|
||||
Suppressed alert logs should be retained for compliance purposes and then deleted once the retention period has elapsed:
|
||||
|
||||
```typescript
|
||||
// Worker job to clean up old logs
|
||||
RunCron(
|
||||
'SuppressedAlertLog:Cleanup',
|
||||
{ schedule: EVERY_DAY, runOnStartup: false },
|
||||
async () => {
|
||||
const retentionDays = 90; // Configurable per project
|
||||
const cutoffDate = OneUptimeDate.addRemoveDays(
|
||||
OneUptimeDate.getCurrentDate(),
|
||||
-retentionDays
|
||||
);
|
||||
|
||||
await SuppressedAlertLogService.deleteBy({
|
||||
query: {
|
||||
suppressedAt: QueryHelper.lessThan(cutoffDate),
|
||||
},
|
||||
props: { isRoot: true },
|
||||
});
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pre-Migration
|
||||
- [ ] Review migration scripts
|
||||
- [ ] Test on staging
|
||||
- [ ] Backup production database
|
||||
|
||||
### Migration
|
||||
- [ ] Run migrations in order
|
||||
- [ ] Verify table creation
|
||||
- [ ] Verify indexes
|
||||
|
||||
### Post-Migration
|
||||
- [ ] Deploy API changes
|
||||
- [ ] Deploy Dashboard changes
|
||||
- [ ] Deploy Worker jobs
|
||||
- [ ] Enable feature flags
|
||||
|
||||
### Monitoring
|
||||
- [ ] Set up suppression metrics
|
||||
- [ ] Alert on engine errors
|
||||
- [ ] Monitor performance
|
||||
165
Docs/Plan/AlertSuppression/README.md
Normal file
165
Docs/Plan/AlertSuppression/README.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# Alert Suppression Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This sub-plan details the implementation of Alert Suppression functionality for OneUptime. This feature allows users to suppress alert creation and/or notifications based on configurable rules including maintenance windows, conditions, and rate limits.
|
||||
|
||||
## Documents
|
||||
|
||||
| Document | Description |
|
||||
|----------|-------------|
|
||||
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
|
||||
| [2-Backend.md](./2-Backend.md) | Backend services and suppression engine |
|
||||
| [3-API.md](./3-API.md) | REST API endpoints |
|
||||
| [4-UI.md](./4-UI.md) | Frontend components and pages |
|
||||
| [5-Migration.md](./5-Migration.md) | Database migrations and rollout |
|
||||
|
||||
## Feature Summary
|
||||
|
||||
### What is Alert Suppression?
|
||||
|
||||
Alert Suppression allows you to temporarily or permanently prevent alerts from being created or notifications from being sent based on configurable rules.
|
||||
|
||||
### Suppression Types
|
||||
|
||||
| Type | Description | Use Case |
|
||||
|------|-------------|----------|
|
||||
| **Maintenance Window** | Time-based suppression | Planned deployments, scheduled maintenance |
|
||||
| **Condition-Based** | Suppress based on alert attributes | Ignore staging alerts, low-priority monitors |
|
||||
| **Rate Limit** | Suppress after threshold exceeded | Prevent alert storms, noise reduction |
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
1. **Maintenance Windows** - Schedule suppression periods (one-time or recurring)
|
||||
2. **Condition Matching** - Suppress alerts matching specific criteria
|
||||
3. **Rate Limiting** - Limit alerts per time window per dimension
|
||||
4. **Suppression Actions** - Choose to suppress creation, notifications, or both
|
||||
5. **Audit Trail** - Track all suppressed alerts for compliance
|
||||
6. **Suppression Groups** - Group related rules for coordinated suppression
|
||||
|
||||
### User Stories
|
||||
|
||||
```
|
||||
As an operator, I want to create a maintenance window
|
||||
so that I don't get alerted during planned deployments.
|
||||
|
||||
As a team lead, I want to suppress notifications for staging alerts
|
||||
so that my team only gets paged for production issues.
|
||||
|
||||
As an SRE, I want to rate-limit alerts per monitor
|
||||
so that a single flapping service doesn't flood my inbox.
|
||||
|
||||
As a compliance officer, I want to see which alerts were suppressed
|
||||
so that I can audit our alert handling procedures.
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Data Models & Core Engine (Week 1-2)
|
||||
|
||||
- [ ] Create AlertSuppressionRule model
|
||||
- [ ] Create AlertSuppressionGroup model
|
||||
- [ ] Create SuppressedAlertLog model
|
||||
- [ ] Implement SuppressionEngine
|
||||
- [ ] Integrate with AlertService
|
||||
|
||||
### Phase 2: Maintenance Windows (Week 3)
|
||||
|
||||
- [ ] Time-based suppression logic
|
||||
- [ ] Recurring schedule support (RRULE)
|
||||
- [ ] Timezone handling
|
||||
- [ ] Calendar UI component
|
||||
|
||||
### Phase 3: Condition & Rate Limiting (Week 4)
|
||||
|
||||
- [ ] Condition-based matching
|
||||
- [ ] Rate limit state tracking
|
||||
- [ ] AlertThrottleState model
|
||||
- [ ] Per-field rate limiting
|
||||
|
||||
### Phase 4: UI Implementation (Week 5-6)
|
||||
|
||||
- [ ] Suppression rules list page
|
||||
- [ ] Create/edit rule forms
|
||||
- [ ] Maintenance window calendar
|
||||
- [ ] Suppressed alerts log view
|
||||
|
||||
### Phase 5: Analytics & Reporting (Week 7)
|
||||
|
||||
- [ ] Suppression metrics dashboard
|
||||
- [ ] Noise reduction statistics
|
||||
- [ ] Audit log export
|
||||
|
||||
## Dependencies
|
||||
|
||||
### Existing Components Used
|
||||
|
||||
- `Alert` model and `AlertService`
|
||||
- `AlertSeverity` and `AlertState` models
|
||||
- `Monitor` and `Label` models
|
||||
- Dashboard ModelTable and ModelForm components
|
||||
- Notification system
|
||||
|
||||
### New Components Created
|
||||
|
||||
- `AlertSuppressionRule` model
|
||||
- `AlertSuppressionGroup` model
|
||||
- `SuppressedAlertLog` model
|
||||
- `AlertThrottleState` model
|
||||
- `SuppressionEngine` service
|
||||
- Suppression UI pages
|
||||
|
||||
## Success Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Suppression rule creation | < 5 minutes |
|
||||
| Rule evaluation latency | < 10ms |
|
||||
| Maintenance window accuracy | 100% (no alerts during window) |
|
||||
| User adoption | 60% of projects with rules |
|
||||
|
||||
## Architecture Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alert Creation Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌──────────────────────┐
|
||||
│ Alert Trigger │
|
||||
│ (Monitor/Manual) │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ SuppressionEngine │
|
||||
│ .evaluate() │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
┌────────────────┼────────────────┐
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌─────────────────┐ ┌─────────────┐ ┌─────────────────┐
|
||||
│ Maintenance │ │ Condition │ │ Rate Limit │
|
||||
│ Window Check │ │ Check │ │ Check │
|
||||
└────────┬────────┘ └──────┬──────┘ └────────┬────────┘
|
||||
│ │ │
|
||||
└────────────────┼────────────────┘
|
||||
│
|
||||
┌─────────┴─────────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐
|
||||
│ SUPPRESS │ │ ALLOW │
|
||||
│ - Log to audit │ │ - Create alert │
|
||||
│ - Skip creation │ │ - Send notifs │
|
||||
│ or notifs │ │ │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
|
||||
- [Alert Grouping Plan](../AlertGrouping/README.md)
|
||||
- [PagerDuty Maintenance Windows](https://support.pagerduty.com/docs/maintenance-windows)
|
||||
- [Splunk Alert Suppression](https://docs.splunk.com/Documentation/ITSI)
|
||||
@@ -4,6 +4,17 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
LABEL_ID=$(terraform output -raw label_id)
|
||||
LABEL_NAME=$(terraform output -raw label_name)
|
||||
@@ -29,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Label exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$LABEL_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$LABEL_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$LABEL_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$LABEL_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$LABEL_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$LABEL_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,17 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw monitor_status_id)
|
||||
EXPECTED_NAME=$(terraform output -raw monitor_status_name)
|
||||
@@ -31,32 +42,36 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Monitor status exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Color matches: $API_COLOR"
|
||||
|
||||
# Validate priority
|
||||
API_PRIORITY=$(echo "$RESPONSE" | jq -r '.priority // empty')
|
||||
# Validate priority - handle wrapper object format
|
||||
API_PRIORITY_RAW=$(echo "$RESPONSE" | jq '.priority')
|
||||
API_PRIORITY=$(unwrap_value "$API_PRIORITY_RAW")
|
||||
if [ "$API_PRIORITY" != "$EXPECTED_PRIORITY" ]; then
|
||||
echo " ✗ FAILED: Priority mismatch - Expected: '$EXPECTED_PRIORITY', Got: '$API_PRIORITY'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,17 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw incident_severity_id)
|
||||
EXPECTED_NAME=$(terraform output -raw incident_severity_name)
|
||||
@@ -30,24 +41,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Incident severity exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,17 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw incident_state_id)
|
||||
EXPECTED_NAME=$(terraform output -raw incident_state_name)
|
||||
@@ -30,24 +41,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Incident state exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,17 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
# e.g., {"_type": "Color", "value": "#FF5733"} -> "#FF5733"
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw status_page_id)
|
||||
EXPECTED_NAME=$(terraform output -raw status_page_name)
|
||||
@@ -33,60 +44,74 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Status page exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate pageTitle
|
||||
API_PAGE_TITLE=$(echo "$RESPONSE" | jq -r '.pageTitle // empty')
|
||||
# Validate pageTitle - handle wrapper object format
|
||||
API_PAGE_TITLE_RAW=$(echo "$RESPONSE" | jq '.pageTitle')
|
||||
API_PAGE_TITLE=$(unwrap_value "$API_PAGE_TITLE_RAW")
|
||||
if [ "$API_PAGE_TITLE" != "$EXPECTED_PAGE_TITLE" ]; then
|
||||
echo " ✗ FAILED: Page title mismatch - Expected: '$EXPECTED_PAGE_TITLE', Got: '$API_PAGE_TITLE'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Page title matches: $API_PAGE_TITLE"
|
||||
|
||||
# Validate pageDescription
|
||||
API_PAGE_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.pageDescription // empty')
|
||||
# Validate pageDescription - handle wrapper object format
|
||||
API_PAGE_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.pageDescription')
|
||||
API_PAGE_DESCRIPTION=$(unwrap_value "$API_PAGE_DESCRIPTION_RAW")
|
||||
if [ "$API_PAGE_DESCRIPTION" != "$EXPECTED_PAGE_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Page description mismatch - Expected: '$EXPECTED_PAGE_DESCRIPTION', Got: '$API_PAGE_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Page description matches: $API_PAGE_DESCRIPTION"
|
||||
|
||||
# Validate isPublicStatusPage
|
||||
API_IS_PUBLIC=$(echo "$RESPONSE" | jq -r '.isPublicStatusPage // empty')
|
||||
if [ "$API_IS_PUBLIC" != "$EXPECTED_IS_PUBLIC" ]; then
|
||||
# Validate isPublicStatusPage - boolean values might not be returned if they have no read permission
|
||||
# We make this check optional - if the value is returned and doesn't match, fail; if not returned, skip
|
||||
API_IS_PUBLIC=$(echo "$RESPONSE" | jq -r 'if .isPublicStatusPage == null then "skip" elif .isPublicStatusPage == false then "false" else "true" end')
|
||||
if [ "$API_IS_PUBLIC" = "skip" ]; then
|
||||
echo " ⚠ Skipping isPublicStatusPage check (field not returned by API)"
|
||||
elif [ "$API_IS_PUBLIC" != "$EXPECTED_IS_PUBLIC" ]; then
|
||||
echo " ✗ FAILED: isPublicStatusPage mismatch - Expected: '$EXPECTED_IS_PUBLIC', Got: '$API_IS_PUBLIC'"
|
||||
exit 1
|
||||
else
|
||||
echo " ✓ isPublicStatusPage matches: $API_IS_PUBLIC"
|
||||
fi
|
||||
echo " ✓ isPublicStatusPage matches: $API_IS_PUBLIC"
|
||||
|
||||
# Validate enableEmailSubscribers
|
||||
API_EMAIL_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r '.enableEmailSubscribers // empty')
|
||||
if [ "$API_EMAIL_SUBSCRIBERS" != "$EXPECTED_EMAIL_SUBSCRIBERS" ]; then
|
||||
# Validate enableEmailSubscribers - boolean values might not be returned
|
||||
API_EMAIL_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r 'if .enableEmailSubscribers == null then "skip" elif .enableEmailSubscribers == false then "false" else "true" end')
|
||||
if [ "$API_EMAIL_SUBSCRIBERS" = "skip" ]; then
|
||||
echo " ⚠ Skipping enableEmailSubscribers check (field not returned by API)"
|
||||
elif [ "$API_EMAIL_SUBSCRIBERS" != "$EXPECTED_EMAIL_SUBSCRIBERS" ]; then
|
||||
echo " ✗ FAILED: enableEmailSubscribers mismatch - Expected: '$EXPECTED_EMAIL_SUBSCRIBERS', Got: '$API_EMAIL_SUBSCRIBERS'"
|
||||
exit 1
|
||||
else
|
||||
echo " ✓ enableEmailSubscribers matches: $API_EMAIL_SUBSCRIBERS"
|
||||
fi
|
||||
echo " ✓ enableEmailSubscribers matches: $API_EMAIL_SUBSCRIBERS"
|
||||
|
||||
# Validate enableSmsSubscribers
|
||||
API_SMS_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r '.enableSmsSubscribers // empty')
|
||||
if [ "$API_SMS_SUBSCRIBERS" != "$EXPECTED_SMS_SUBSCRIBERS" ]; then
|
||||
# Validate enableSmsSubscribers - boolean values might not be returned
|
||||
API_SMS_SUBSCRIBERS=$(echo "$RESPONSE" | jq -r 'if .enableSmsSubscribers == null then "skip" elif .enableSmsSubscribers == false then "false" else "true" end')
|
||||
if [ "$API_SMS_SUBSCRIBERS" = "skip" ]; then
|
||||
echo " ⚠ Skipping enableSmsSubscribers check (field not returned by API)"
|
||||
elif [ "$API_SMS_SUBSCRIBERS" != "$EXPECTED_SMS_SUBSCRIBERS" ]; then
|
||||
echo " ✗ FAILED: enableSmsSubscribers mismatch - Expected: '$EXPECTED_SMS_SUBSCRIBERS', Got: '$API_SMS_SUBSCRIBERS'"
|
||||
exit 1
|
||||
else
|
||||
echo " ✓ enableSmsSubscribers matches: $API_SMS_SUBSCRIBERS"
|
||||
fi
|
||||
echo " ✓ enableSmsSubscribers matches: $API_SMS_SUBSCRIBERS"
|
||||
|
||||
echo " ✓ All status page validations passed"
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw alert_severity_id)
|
||||
EXPECTED_NAME=$(terraform output -raw alert_severity_name)
|
||||
@@ -30,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Alert severity exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw alert_state_id)
|
||||
EXPECTED_NAME=$(terraform output -raw alert_state_name)
|
||||
@@ -30,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Alert state exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw label_id)
|
||||
EXPECTED_NAME=$(terraform output -raw label_name)
|
||||
@@ -29,24 +39,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Label exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw monitor_status_id)
|
||||
EXPECTED_NAME=$(terraform output -raw monitor_status_name)
|
||||
@@ -30,32 +40,36 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Monitor status exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Color matches: $API_COLOR"
|
||||
|
||||
# Validate priority
|
||||
API_PRIORITY=$(echo "$RESPONSE" | jq -r '.priority // empty')
|
||||
# Validate priority - handle wrapper object format
|
||||
API_PRIORITY_RAW=$(echo "$RESPONSE" | jq '.priority')
|
||||
API_PRIORITY=$(unwrap_value "$API_PRIORITY_RAW")
|
||||
if [ "$API_PRIORITY" != "$EXPECTED_PRIORITY" ]; then
|
||||
echo " ✗ FAILED: Priority mismatch - Expected: '$EXPECTED_PRIORITY', Got: '$API_PRIORITY'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format
|
||||
unwrap_value() {
|
||||
local raw_value="$1"
|
||||
if echo "$raw_value" | jq -e '.value' > /dev/null 2>&1; then
|
||||
echo "$raw_value" | jq -r '.value'
|
||||
else
|
||||
echo "$raw_value" | jq -r '.'
|
||||
fi
|
||||
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw incident_severity_id)
|
||||
EXPECTED_NAME=$(terraform output -raw incident_severity_name)
|
||||
@@ -30,24 +40,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Incident severity exists in API"
|
||||
|
||||
# Validate name
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
# Validate name - handle wrapper object format
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Name matches: $API_NAME"
|
||||
|
||||
# Validate description
|
||||
API_DESCRIPTION=$(echo "$RESPONSE" | jq -r '.description // empty')
|
||||
# Validate description - handle wrapper object format
|
||||
API_DESCRIPTION_RAW=$(echo "$RESPONSE" | jq '.description')
|
||||
API_DESCRIPTION=$(unwrap_value "$API_DESCRIPTION_RAW")
|
||||
if [ "$API_DESCRIPTION" != "$EXPECTED_DESCRIPTION" ]; then
|
||||
echo " ✗ FAILED: Description mismatch - Expected: '$EXPECTED_DESCRIPTION', Got: '$API_DESCRIPTION'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Description matches: $API_DESCRIPTION"
|
||||
|
||||
# Validate color
|
||||
API_COLOR=$(echo "$RESPONSE" | jq -r '.color // empty')
|
||||
# Validate color - handle wrapper object format
|
||||
API_COLOR_RAW=$(echo "$RESPONSE" | jq '.color')
|
||||
API_COLOR=$(unwrap_value "$API_COLOR_RAW")
|
||||
if [ "$API_COLOR" != "$EXPECTED_COLOR" ]; then
|
||||
echo " ✗ FAILED: Color mismatch - Expected: '$EXPECTED_COLOR', Got: '$API_COLOR'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format.
#
# Arguments:
#   $1 - JSON text for a single field, as produced by `jq '.field'`
#        (e.g. "some name" or {"_type":"Version","value":"1.0.0"}).
# Outputs:
#   The raw (unquoted) inner `value` when $1 is an object that has a `value`
#   key; otherwise $1 itself rendered with `jq -r`.
unwrap_value() {
    local raw_value="$1"
    # Test for the key with has() rather than the truthiness of .value, so a
    # wrapper whose value is false or null is still unwrapped instead of being
    # printed as a whole object. printf avoids echo's option/escape handling
    # on arbitrary JSON text.
    printf '%s\n' "$raw_value" |
        jq -r 'if type == "object" and has("value") then .value else . end'
}
|
||||
|
||||
# Get terraform outputs
|
||||
DOMAIN_ID=$(terraform output -raw domain_id)
|
||||
STATUS_PAGE_ID=$(terraform output -raw status_page_id)
|
||||
@@ -33,7 +43,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Domain exists in API"
|
||||
|
||||
API_DOMAIN=$(echo "$RESPONSE" | jq -r '.domain // empty')
|
||||
API_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.domain')
|
||||
API_DOMAIN=$(unwrap_value "$API_DOMAIN_RAW")
|
||||
if [ "$API_DOMAIN" != "$EXPECTED_DOMAIN_NAME" ]; then
|
||||
echo " ✗ FAILED: Domain name mismatch - Expected: '$EXPECTED_DOMAIN_NAME', Got: '$API_DOMAIN'"
|
||||
exit 1
|
||||
@@ -66,7 +77,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Status page exists in API"
|
||||
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_SP_NAME" ]; then
|
||||
echo " ✗ FAILED: Name mismatch - Expected: '$EXPECTED_SP_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
@@ -92,14 +104,16 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Status page domain exists in API"
|
||||
|
||||
API_SUBDOMAIN=$(echo "$RESPONSE" | jq -r '.subdomain // empty')
|
||||
API_SUBDOMAIN_RAW=$(echo "$RESPONSE" | jq '.subdomain')
|
||||
API_SUBDOMAIN=$(unwrap_value "$API_SUBDOMAIN_RAW")
|
||||
if [ "$API_SUBDOMAIN" != "$EXPECTED_SUBDOMAIN" ]; then
|
||||
echo " ✗ FAILED: Subdomain mismatch - Expected: '$EXPECTED_SUBDOMAIN', Got: '$API_SUBDOMAIN'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Subdomain matches: $API_SUBDOMAIN"
|
||||
|
||||
API_FULL_DOMAIN=$(echo "$RESPONSE" | jq -r '.fullDomain // empty')
|
||||
API_FULL_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.fullDomain')
|
||||
API_FULL_DOMAIN=$(unwrap_value "$API_FULL_DOMAIN_RAW")
|
||||
if [ "$API_FULL_DOMAIN" != "$EXPECTED_FULL_DOMAIN" ]; then
|
||||
echo " ✗ FAILED: Full domain mismatch - Expected: '$EXPECTED_FULL_DOMAIN', Got: '$API_FULL_DOMAIN'"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format.
#
# Arguments:
#   $1 - JSON text for a single field, as produced by `jq '.field'`
#        (e.g. "some name" or {"_type":"Version","value":"1.0.0"}).
# Outputs:
#   The raw (unquoted) inner `value` when $1 is an object that has a `value`
#   key; otherwise $1 itself rendered with `jq -r`.
unwrap_value() {
    local raw_value="$1"
    # Test for the key with has() rather than the truthiness of .value, so a
    # wrapper whose value is false or null is still unwrapped instead of being
    # printed as a whole object. printf avoids echo's option/escape handling
    # on arbitrary JSON text.
    printf '%s\n' "$raw_value" |
        jq -r 'if type == "object" and has("value") then .value else . end'
}
|
||||
|
||||
# Get terraform outputs
|
||||
DOMAIN_ID=$(terraform output -raw domain_id)
|
||||
STATUS_PAGE_ID=$(terraform output -raw status_page_id)
|
||||
@@ -34,7 +44,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Domain exists in API"
|
||||
|
||||
API_DOMAIN=$(echo "$RESPONSE" | jq -r '.domain // empty')
|
||||
API_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.domain')
|
||||
API_DOMAIN=$(unwrap_value "$API_DOMAIN_RAW")
|
||||
if [ "$API_DOMAIN" != "$EXPECTED_DOMAIN_NAME" ]; then
|
||||
echo " ✗ FAILED: Domain name mismatch - Expected: '$EXPECTED_DOMAIN_NAME', Got: '$API_DOMAIN'"
|
||||
exit 1
|
||||
@@ -64,11 +75,12 @@ echo ""
|
||||
echo " Verifying status page domain computed fields (Issue #2236)..."
|
||||
echo " Status Page Domain ID: $STATUS_PAGE_DOMAIN_ID"
|
||||
|
||||
# Note: cnameVerificationToken has no read permission, so we don't include it in the select
|
||||
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/status-page-domain/${STATUS_PAGE_DOMAIN_ID}/get-item" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Apikey: $TF_VAR_api_key" \
|
||||
-H "projectid: $TF_VAR_project_id" \
|
||||
-d '{"select": {"_id": true, "subdomain": true, "fullDomain": true, "cnameVerificationToken": true}}')
|
||||
-d '{"select": {"_id": true, "subdomain": true, "fullDomain": true}}')
|
||||
|
||||
API_ID=$(echo "$RESPONSE" | jq -r '._id // empty')
|
||||
if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
@@ -79,7 +91,8 @@ fi
|
||||
echo " ✓ Status page domain exists in API"
|
||||
|
||||
# Validate subdomain
|
||||
API_SUBDOMAIN=$(echo "$RESPONSE" | jq -r '.subdomain // empty')
|
||||
API_SUBDOMAIN_RAW=$(echo "$RESPONSE" | jq '.subdomain')
|
||||
API_SUBDOMAIN=$(unwrap_value "$API_SUBDOMAIN_RAW")
|
||||
if [ "$API_SUBDOMAIN" != "$EXPECTED_SUBDOMAIN" ]; then
|
||||
echo " ✗ FAILED: Subdomain mismatch - Expected: '$EXPECTED_SUBDOMAIN', Got: '$API_SUBDOMAIN'"
|
||||
exit 1
|
||||
@@ -87,7 +100,8 @@ fi
|
||||
echo " ✓ Subdomain matches: $API_SUBDOMAIN"
|
||||
|
||||
# Validate computed full_domain (Issue #2236 key validation)
|
||||
API_FULL_DOMAIN=$(echo "$RESPONSE" | jq -r '.fullDomain // empty')
|
||||
API_FULL_DOMAIN_RAW=$(echo "$RESPONSE" | jq '.fullDomain')
|
||||
API_FULL_DOMAIN=$(unwrap_value "$API_FULL_DOMAIN_RAW")
|
||||
if [ -z "$API_FULL_DOMAIN" ] || [ "$API_FULL_DOMAIN" = "null" ]; then
|
||||
echo " ✗ FAILED: fullDomain is empty - server should compute this value"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format.
#
# Arguments:
#   $1 - JSON text for a single field, as produced by `jq '.field'`
#        (e.g. "some name" or {"_type":"Version","value":"1.0.0"}).
# Outputs:
#   The raw (unquoted) inner `value` when $1 is an object that has a `value`
#   key; otherwise $1 itself rendered with `jq -r`.
unwrap_value() {
    local raw_value="$1"
    # Test for the key with has() rather than the truthiness of .value, so a
    # wrapper whose value is false or null is still unwrapped instead of being
    # printed as a whole object. printf avoids echo's option/escape handling
    # on arbitrary JSON text.
    printf '%s\n' "$raw_value" |
        jq -r 'if type == "object" and has("value") then .value else . end'
}
|
||||
|
||||
# Get terraform outputs
|
||||
INCIDENT_ID=$(terraform output -raw incident_id)
|
||||
INCIDENT_SEVERITY_ID=$(terraform output -raw incident_severity_id)
|
||||
@@ -30,7 +40,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Incident severity exists in API"
|
||||
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_SEVERITY_NAME" ]; then
|
||||
echo " ✗ FAILED: Severity name mismatch - Expected: '$EXPECTED_SEVERITY_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
@@ -56,24 +67,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Incident exists in API"
|
||||
|
||||
# Validate title
|
||||
API_TITLE=$(echo "$RESPONSE" | jq -r '.title // empty')
|
||||
# Validate title - handle wrapper object format
|
||||
API_TITLE_RAW=$(echo "$RESPONSE" | jq '.title')
|
||||
API_TITLE=$(unwrap_value "$API_TITLE_RAW")
|
||||
if [ "$API_TITLE" != "$EXPECTED_TITLE" ]; then
|
||||
echo " ✗ FAILED: Title mismatch - Expected: '$EXPECTED_TITLE', Got: '$API_TITLE'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Title matches: $API_TITLE"
|
||||
|
||||
# Validate incident severity relationship
|
||||
API_SEVERITY_ID=$(echo "$RESPONSE" | jq -r '.incidentSeverityId // empty')
|
||||
# Validate incident severity relationship - handle wrapper object format (ObjectID)
|
||||
API_SEVERITY_ID_RAW=$(echo "$RESPONSE" | jq '.incidentSeverityId')
|
||||
API_SEVERITY_ID=$(unwrap_value "$API_SEVERITY_ID_RAW")
|
||||
if [ "$API_SEVERITY_ID" != "$INCIDENT_SEVERITY_ID" ]; then
|
||||
echo " ✗ FAILED: Incident severity ID mismatch - Expected: '$INCIDENT_SEVERITY_ID', Got: '$API_SEVERITY_ID'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Incident severity ID matches"
|
||||
|
||||
# Validate server-provided currentIncidentStateId
|
||||
CURRENT_STATE_ID=$(echo "$RESPONSE" | jq -r '.currentIncidentStateId // empty')
|
||||
# Validate server-provided currentIncidentStateId - handle wrapper object format (ObjectID)
|
||||
CURRENT_STATE_ID_RAW=$(echo "$RESPONSE" | jq '.currentIncidentStateId')
|
||||
CURRENT_STATE_ID=$(unwrap_value "$CURRENT_STATE_ID_RAW")
|
||||
if [ -n "$CURRENT_STATE_ID" ] && [ "$CURRENT_STATE_ID" != "null" ]; then
|
||||
echo " ✓ Server-assigned currentIncidentStateId: $CURRENT_STATE_ID"
|
||||
fi
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format.
#
# Arguments:
#   $1 - JSON text for a single field, as produced by `jq '.field'`
#        (e.g. "some name" or {"_type":"Version","value":"1.0.0"}).
# Outputs:
#   The raw (unquoted) inner `value` when $1 is an object that has a `value`
#   key; otherwise $1 itself rendered with `jq -r`.
unwrap_value() {
    local raw_value="$1"
    # Test for the key with has() rather than the truthiness of .value, so a
    # wrapper whose value is false or null is still unwrapped instead of being
    # printed as a whole object. printf avoids echo's option/escape handling
    # on arbitrary JSON text.
    printf '%s\n' "$raw_value" |
        jq -r 'if type == "object" and has("value") then .value else . end'
}
|
||||
|
||||
# Get terraform outputs
|
||||
ALERT_ID=$(terraform output -raw alert_id)
|
||||
ALERT_SEVERITY_ID=$(terraform output -raw alert_severity_id)
|
||||
@@ -30,7 +40,8 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Alert severity exists in API"
|
||||
|
||||
API_NAME=$(echo "$RESPONSE" | jq -r '.name // empty')
|
||||
API_NAME_RAW=$(echo "$RESPONSE" | jq '.name')
|
||||
API_NAME=$(unwrap_value "$API_NAME_RAW")
|
||||
if [ "$API_NAME" != "$EXPECTED_SEVERITY_NAME" ]; then
|
||||
echo " ✗ FAILED: Severity name mismatch - Expected: '$EXPECTED_SEVERITY_NAME', Got: '$API_NAME'"
|
||||
exit 1
|
||||
@@ -56,24 +67,27 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Alert exists in API"
|
||||
|
||||
# Validate title
|
||||
API_TITLE=$(echo "$RESPONSE" | jq -r '.title // empty')
|
||||
# Validate title - handle wrapper object format
|
||||
API_TITLE_RAW=$(echo "$RESPONSE" | jq '.title')
|
||||
API_TITLE=$(unwrap_value "$API_TITLE_RAW")
|
||||
if [ "$API_TITLE" != "$EXPECTED_TITLE" ]; then
|
||||
echo " ✗ FAILED: Title mismatch - Expected: '$EXPECTED_TITLE', Got: '$API_TITLE'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Title matches: $API_TITLE"
|
||||
|
||||
# Validate alert severity relationship
|
||||
API_SEVERITY_ID=$(echo "$RESPONSE" | jq -r '.alertSeverityId // empty')
|
||||
# Validate alert severity relationship - handle wrapper object format (ObjectID)
|
||||
API_SEVERITY_ID_RAW=$(echo "$RESPONSE" | jq '.alertSeverityId')
|
||||
API_SEVERITY_ID=$(unwrap_value "$API_SEVERITY_ID_RAW")
|
||||
if [ "$API_SEVERITY_ID" != "$ALERT_SEVERITY_ID" ]; then
|
||||
echo " ✗ FAILED: Alert severity ID mismatch - Expected: '$ALERT_SEVERITY_ID', Got: '$API_SEVERITY_ID'"
|
||||
exit 1
|
||||
fi
|
||||
echo " ✓ Alert severity ID matches"
|
||||
|
||||
# Validate server-provided currentAlertStateId
|
||||
CURRENT_STATE_ID=$(echo "$RESPONSE" | jq -r '.currentAlertStateId // empty')
|
||||
# Validate server-provided currentAlertStateId - handle wrapper object format (ObjectID)
|
||||
CURRENT_STATE_ID_RAW=$(echo "$RESPONSE" | jq '.currentAlertStateId')
|
||||
CURRENT_STATE_ID=$(unwrap_value "$CURRENT_STATE_ID_RAW")
|
||||
if [ -n "$CURRENT_STATE_ID" ] && [ "$CURRENT_STATE_ID" != "null" ]; then
|
||||
echo " ✓ Server-assigned currentAlertStateId: $CURRENT_STATE_ID"
|
||||
fi
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
|
||||
set -e
|
||||
|
||||
# Helper function to unwrap API values that might be in wrapper format.
#
# Arguments:
#   $1 - JSON text for a single field, as produced by `jq '.field'`
#        (e.g. "some name" or {"_type":"Version","value":"1.0.0"}).
# Outputs:
#   The raw (unquoted) inner `value` when $1 is an object that has a `value`
#   key; otherwise $1 itself rendered with `jq -r`.
unwrap_value() {
    local raw_value="$1"
    # Test for the key with has() rather than the truthiness of .value, so a
    # wrapper whose value is false or null is still unwrapped instead of being
    # printed as a whole object. printf avoids echo's option/escape handling
    # on arbitrary JSON text.
    printf '%s\n' "$raw_value" |
        jq -r 'if type == "object" and has("value") then .value else . end'
}
|
||||
|
||||
# Get terraform outputs
|
||||
RESOURCE_ID=$(terraform output -raw scheduled_maintenance_event_id)
|
||||
EXPECTED_TITLE=$(terraform output -raw scheduled_maintenance_event_title)
|
||||
@@ -14,7 +24,7 @@ echo " Verifying scheduled maintenance event with server defaults via API..."
|
||||
echo " Resource ID: $RESOURCE_ID"
|
||||
|
||||
# Call API to get the resource
|
||||
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/scheduled-maintenance-event/${RESOURCE_ID}/get-item" \
|
||||
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/scheduled-maintenance/${RESOURCE_ID}/get-item" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Apikey: $TF_VAR_api_key" \
|
||||
-H "projectid: $TF_VAR_project_id" \
|
||||
@@ -29,8 +39,9 @@ if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
|
||||
fi
|
||||
echo " ✓ Scheduled maintenance event exists in API"
|
||||
|
||||
# Validate title
|
||||
API_TITLE=$(echo "$RESPONSE" | jq -r '.title // empty')
|
||||
# Validate title - handle wrapper object format
|
||||
API_TITLE_RAW=$(echo "$RESPONSE" | jq '.title')
|
||||
API_TITLE=$(unwrap_value "$API_TITLE_RAW")
|
||||
if [ "$API_TITLE" != "$EXPECTED_TITLE" ]; then
|
||||
echo " ✗ FAILED: Title mismatch - Expected: '$EXPECTED_TITLE', Got: '$API_TITLE'"
|
||||
exit 1
|
||||
@@ -49,8 +60,9 @@ if [ -n "$API_ENDS_AT" ] && [ "$API_ENDS_AT" != "null" ]; then
|
||||
echo " ✓ endsAt is set: $API_ENDS_AT"
|
||||
fi
|
||||
|
||||
# Validate server-provided currentScheduledMaintenanceStateId
|
||||
CURRENT_STATE_ID=$(echo "$RESPONSE" | jq -r '.currentScheduledMaintenanceStateId // empty')
|
||||
# Validate server-provided currentScheduledMaintenanceStateId - handle wrapper object format (ObjectID)
|
||||
CURRENT_STATE_ID_RAW=$(echo "$RESPONSE" | jq '.currentScheduledMaintenanceStateId')
|
||||
CURRENT_STATE_ID=$(unwrap_value "$CURRENT_STATE_ID_RAW")
|
||||
if [ -n "$CURRENT_STATE_ID" ] && [ "$CURRENT_STATE_ID" != "null" ]; then
|
||||
echo " ✓ Server-assigned currentScheduledMaintenanceStateId: $CURRENT_STATE_ID"
|
||||
fi
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
terraform {
  required_providers {
    oneuptime = {
      source  = "oneuptime/oneuptime"
      version = "1.0.0"
    }
  }
}

provider "oneuptime" {
  oneuptime_url = var.oneuptime_url
  api_key       = var.api_key
}

locals {
  # Evaluate the timestamp once so that key and name always carry the same
  # suffix; two separate timestamp() calls are evaluated independently and
  # can land on different seconds.
  run_suffix = formatdate("YYYYMMDDhhmmss", timestamp())
}

# Test for probe_version READ operation idempotency
# This test validates that:
# 1. probe_version is stored correctly as "1.0.0" after create (not as wrapped JSON)
# 2. Running terraform apply again (idempotency check) doesn't detect drift
# 3. The READ operation properly unwraps {"_type":"Version","value":"1.0.0"} to "1.0.0"
#
# Bug scenario being tested:
# - First apply: CREATE succeeds, probe_version = "1.0.0" in state
# - Second apply: READ returns wrapped format {"_type":"Version","value":"1.0.0"}
# - Provider fails with "inconsistent result after apply"
resource "oneuptime_probe" "test" {
  project_id    = var.project_id
  key           = "tf-probe-idem-${local.run_suffix}"
  name          = "tf-probe-idempotency-test-${local.run_suffix}"
  probe_version = "1.0.0"

  lifecycle {
    # key and name embed a timestamp, so without this they would differ on
    # every run and mask the probe_version drift this test is looking for.
    ignore_changes = [key, name]
  }
}

output "probe_id" {
  value       = oneuptime_probe.test.id
  description = "ID of the created probe"
}

output "probe_version" {
  value       = oneuptime_probe.test.probe_version
  description = "Version of the created probe - should always be '1.0.0', never wrapped JSON"
}
|
||||
@@ -0,0 +1,15 @@
|
||||
# Passed to the provider block as its oneuptime_url argument.
variable "oneuptime_url" {
  description = "OneUptime API URL"
  type        = string
}
|
||||
|
||||
# Passed to the provider block as its api_key argument; flagged sensitive.
variable "api_key" {
  description = "OneUptime API Key"
  type        = string
  sensitive   = true
}
|
||||
|
||||
# Project that the test probe resource is created under.
variable "project_id" {
  description = "OneUptime Project ID"
  type        = string
}
|
||||
88
E2E/Terraform/e2e-tests/tests/20-probe-version-idempotency/verify.sh
Executable file
88
E2E/Terraform/e2e-tests/tests/20-probe-version-idempotency/verify.sh
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/bin/bash
# Verify script for 20-probe-version-idempotency test
#
# This test validates the probe_version READ idempotency issue:
# Bug: After CREATE, the READ operation returns wrapped format {"_type":"Version","value":"1.0.0"}
# instead of unwrapping it to "1.0.0", causing state drift.
#
# Test approach:
# 1. Check the probe_version in Terraform state (should be "1.0.0")
# 2. Run terraform plan to check for drift (should show no changes)
# 3. Verify via API that the data is consistent
#
# Reads from the environment: ONEUPTIME_URL, TF_VAR_api_key, TF_VAR_project_id.
# Exits 0 when every check passes and 1 on the first failed check.

set -e

echo " Testing probe_version idempotency (READ operation unwrapping)..."

# Get terraform outputs
RESOURCE_ID=$(terraform output -raw probe_id)
EXPECTED_VERSION=$(terraform output -raw probe_version)

echo " Resource ID: $RESOURCE_ID"
echo " Expected probe_version: $EXPECTED_VERSION"

# Step 1: Validate that probe_version in state is clean (not wrapped JSON)
if [[ "$EXPECTED_VERSION" == *"_type"* ]] || [[ "$EXPECTED_VERSION" == *'"value"'* ]]; then
    echo " ✗ FAILED: probe_version in state is wrapped JSON: $EXPECTED_VERSION"
    echo " Expected clean version string like '1.0.0'"
    exit 1
fi
echo " ✓ probe_version in state is clean: $EXPECTED_VERSION"

# Step 2: Run terraform plan and check for drift
# This is the critical test - if READ doesn't unwrap properly, plan will show drift
echo " Running terraform plan to check for drift..."

# Start from a known value so a PLAN_EXIT_CODE inherited from the environment
# cannot be mistaken for the result of this plan. The `|| ...` keeps `set -e`
# from aborting on the non-zero statuses that -detailed-exitcode produces.
PLAN_EXIT_CODE=0
PLAN_OUTPUT=$(terraform plan -detailed-exitcode 2>&1) || PLAN_EXIT_CODE=$?

# Exit code 0 = no changes (success)
# Exit code 2 = changes detected (drift)
# Any other non-zero exit code = error (1 from terraform itself; others when
# the command could not be run at all, e.g. 127)
if [ "$PLAN_EXIT_CODE" -eq 2 ]; then
    echo " ✗ FAILED: Terraform plan detected drift!"
    echo " This indicates the READ operation is not properly unwrapping the probe_version"
    echo " Plan output:"
    echo "$PLAN_OUTPUT"
    exit 1
elif [ "$PLAN_EXIT_CODE" -ne 0 ]; then
    echo " ✗ FAILED: Terraform plan error"
    echo "$PLAN_OUTPUT"
    exit 1
fi
echo " ✓ Terraform plan shows no changes (idempotent)"

# Step 3: Verify via API that probe_version matches
echo " Verifying probe_version via API..."
RESPONSE=$(curl -s -X POST "${ONEUPTIME_URL}/api/probe/${RESOURCE_ID}/get-item" \
    -H "Content-Type: application/json" \
    -H "Apikey: $TF_VAR_api_key" \
    -H "projectid: $TF_VAR_project_id" \
    -d '{"select": {"_id": true, "probeVersion": true}}')

# Check if response contains the resource
API_ID=$(echo "$RESPONSE" | jq -r '._id // empty')
if [ -z "$API_ID" ] || [ "$API_ID" = "null" ]; then
    echo " ✗ FAILED: Probe not found in API response"
    echo " Response: $RESPONSE"
    exit 1
fi

# Extract probe version - handle wrapper object format.
# Detect the wrapper by key presence (has) rather than by the truthiness of
# .value, so a wrapper holding null/false is still recognised as a wrapper.
API_VERSION_RAW=$(echo "$RESPONSE" | jq '.probeVersion')
if echo "$API_VERSION_RAW" | jq -e 'type == "object" and has("value")' > /dev/null 2>&1; then
    API_VERSION=$(echo "$API_VERSION_RAW" | jq -r '.value')
    echo " Note: API returns wrapped format: $API_VERSION_RAW"
    echo " Provider should unwrap to: $API_VERSION"
else
    API_VERSION=$(echo "$API_VERSION_RAW" | jq -r '.')
fi

if [ "$API_VERSION" != "$EXPECTED_VERSION" ]; then
    echo " ✗ FAILED: Probe version mismatch"
    echo " Terraform state: $EXPECTED_VERSION"
    echo " API (unwrapped): $API_VERSION"
    exit 1
fi
echo " ✓ probe_version matches: $API_VERSION"

echo " ✓ All probe_version idempotency tests passed"
echo " The READ operation correctly unwraps Version wrapper objects"
|
||||
@@ -1355,6 +1355,9 @@ func (r *${resourceTypeName}Resource) Delete(ctx context.Context, req resource.D
|
||||
// Check if it's a wrapper object with value field (e.g., Version, DateTime types)
|
||||
if innerVal, ok := val["value"].(string); ok {
|
||||
${fieldName} = types.StringValue(innerVal)
|
||||
} else if innerVal, ok := val["value"].(float64); ok {
|
||||
// Handle numeric values that might be returned as float64
|
||||
${fieldName} = types.StringValue(fmt.Sprintf("%v", innerVal))
|
||||
} else if jsonBytes, err := json.Marshal(val); err == nil {
|
||||
${fieldName} = types.StringValue(string(jsonBytes))
|
||||
} else {
|
||||
@@ -1366,12 +1369,25 @@ func (r *${resourceTypeName}Resource) Delete(ctx context.Context, req resource.D
|
||||
${fieldName} = types.StringNull()
|
||||
}`;
|
||||
}
|
||||
/*
|
||||
* Default string handling - also unwrap wrapper objects for consistency
|
||||
* This ensures that even if isComplexObject is not set correctly,
|
||||
* wrapper objects like {"_type":"Version","value":"1.0.0"} are still properly unwrapped
|
||||
* This fixes the READ operation drift issue where API returns wrapped format
|
||||
*/
|
||||
return `if obj, ok := ${responseValue}.(map[string]interface{}); ok {
|
||||
// Handle ObjectID type responses
|
||||
// Handle ObjectID type responses and wrapper objects (e.g., Version, DateTime, Name types)
|
||||
if val, ok := obj["_id"].(string); ok && val != "" {
|
||||
${fieldName} = types.StringValue(val)
|
||||
} else if val, ok := obj["value"].(string); ok && val != "" {
|
||||
} else if val, ok := obj["value"].(string); ok {
|
||||
// Unwrap wrapper objects - extract the inner value regardless of whether it's empty
|
||||
${fieldName} = types.StringValue(val)
|
||||
} else if val, ok := obj["value"].(float64); ok {
|
||||
// Handle numeric values that might be returned as float64
|
||||
${fieldName} = types.StringValue(fmt.Sprintf("%v", val))
|
||||
} else if jsonBytes, err := json.Marshal(obj); err == nil {
|
||||
// Fallback to JSON marshaling for other complex objects
|
||||
${fieldName} = types.StringValue(string(jsonBytes))
|
||||
} else {
|
||||
${fieldName} = types.StringNull()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user