mirror of
https://github.com/OneUptime/oneuptime.git
synced 2026-04-06 00:32:12 +02:00
feat: Implement Alert Suppression UI and Migration Plan
- Added UI components and pages for Alert Suppression including: - Suppression Rules List Page - Create/Edit Suppression Rule Page - Suppressed Alerts Log Page - Maintenance Windows Calendar View - Active Maintenance Banner - Quick Maintenance Modal - Created migration scripts for new database tables: - AlertSuppressionGroup - AlertSuppressionRule - SuppressedAlertLog - AlertThrottleState - Defined rollout strategy and data retention policies for suppressed alerts - Updated README with implementation plan and architecture diagram
This commit is contained in:
548
Docs/Plan/AlertDeduplication/1-DataModels.md
Normal file
548
Docs/Plan/AlertDeduplication/1-DataModels.md
Normal file
@@ -0,0 +1,548 @@
|
||||
# Data Models for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the database models required for Alert Deduplication functionality.
|
||||
|
||||
## Entity Relationship Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────┐
|
||||
│ AlertFingerprint │
|
||||
├─────────────────────────┤
|
||||
│ id │
|
||||
│ projectId │
|
||||
│ fingerprint (hash) │◄──────┐
|
||||
│ fingerprintFields │ │
|
||||
│ canonicalAlertId │───────┼──► Alert
|
||||
│ duplicateCount │ │
│ lastDuplicateAt │ │
|
||||
│ windowStartAt │ │
|
||||
│ windowEndAt │ │
|
||||
└─────────────────────────┘ │
|
||||
│
|
||||
┌─────────────────────────────────┴───────────────────────────────────┐
|
||||
│ Alert (existing) │
|
||||
├─────────────────────────────────────────────────────────────────────┤
|
||||
│ + fingerprint (NEW) - SHA-256 hash of alert │
|
||||
│ + duplicateCount (NEW) - Number of duplicates suppressed │
|
||||
│ + lastDuplicateAt (NEW) - When last duplicate occurred │
|
||||
└─────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Model Definitions
|
||||
|
||||
### 1. AlertFingerprint
|
||||
|
||||
Cache of active fingerprints for deduplication lookups.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/AlertFingerprint.ts`
|
||||
|
||||
```typescript
|
||||
import {
|
||||
Column,
|
||||
Entity,
|
||||
Index,
|
||||
JoinColumn,
|
||||
ManyToOne,
|
||||
} from 'typeorm';
|
||||
import BaseModel from './DatabaseBaseModel/DatabaseBaseModel';
|
||||
import Project from './Project';
|
||||
import Alert from './Alert';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import ColumnType from 'Common/Types/Database/ColumnType';
|
||||
import TableColumnType from 'Common/Types/Database/TableColumnType';
|
||||
import Permission from 'Common/Types/Permission';
|
||||
import IconProp from 'Common/Types/Icon/IconProp';
|
||||
|
||||
@TableMetadata({
  tableName: 'AlertFingerprint',
  singularName: 'Alert Fingerprint',
  pluralName: 'Alert Fingerprints',
  icon: IconProp.Key,
  tableDescription: 'Stores fingerprints for alert deduplication',
})
// Composite index for the deduplication lookup, which filters on project,
// fingerprint hash and a still-open window. Declaring it on the entity keeps
// the model in line with `idx_fingerprint_lookup` in the "Database Indexes"
// section. NOTE(review): confirm the project's migration generator picks up
// class-level indexes.
@Index(['projectId', 'fingerprint', 'windowEndAt'])
@Entity({
  name: 'AlertFingerprint',
})
/**
 * One row per (project, fingerprint, deduplication window).
 *
 * Each row points at the canonical alert that opened the window and counts
 * how many later alerts with the same fingerprint were suppressed.
 *
 * Every column lists no roles for create/update and only ProjectOwner and
 * ProjectAdmin for read.
 */
export default class AlertFingerprint extends BaseModel {
  // ─────────────────────────────────────────────────────────────
  // PROJECT
  // ─────────────────────────────────────────────────────────────

  /** Owning project; rows are removed when the project is deleted (CASCADE). */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Entity,
    modelType: Project,
    title: 'Project',
  })
  @ManyToOne(() => Project, {
    onDelete: 'CASCADE',
    orphanedRowAction: 'delete',
  })
  @JoinColumn({ name: 'projectId' })
  public project?: Project = undefined;

  /** Foreign key for `project`; required and indexed. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.ObjectID,
    title: 'Project ID',
  })
  @Column({
    type: ColumnType.ObjectID,
    nullable: false,
  })
  @Index()
  public projectId?: ObjectID = undefined;

  // ─────────────────────────────────────────────────────────────
  // FINGERPRINT
  // ─────────────────────────────────────────────────────────────

  /** SHA-256 hash (64 hex characters) computed from the alert's fields. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.ShortText,
    title: 'Fingerprint',
    description: 'SHA-256 hash of the alert fields',
  })
  @Column({
    type: ColumnType.ShortText,
    length: 64, // SHA-256 hex length
    nullable: false,
  })
  @Index()
  public fingerprint?: string = undefined;

  /** Names of the alert fields that were hashed to produce `fingerprint`. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.JSON,
    title: 'Fingerprint Fields',
    description: 'Fields used to compute this fingerprint',
  })
  @Column({
    type: ColumnType.JSON,
    nullable: false,
  })
  public fingerprintFields?: Array<string> = undefined;

  // ─────────────────────────────────────────────────────────────
  // CANONICAL ALERT
  // ─────────────────────────────────────────────────────────────

  /** The original alert; deleting it deletes this fingerprint row (CASCADE). */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Entity,
    modelType: Alert,
    title: 'Canonical Alert',
    description: 'The original alert this fingerprint refers to',
  })
  @ManyToOne(() => Alert, {
    onDelete: 'CASCADE',
    orphanedRowAction: 'delete',
  })
  @JoinColumn({ name: 'canonicalAlertId' })
  public canonicalAlert?: Alert = undefined;

  /** Foreign key for `canonicalAlert`; required and indexed. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.ObjectID,
    title: 'Canonical Alert ID',
    description: 'ID of the original alert',
  })
  @Column({
    type: ColumnType.ObjectID,
    nullable: false,
  })
  @Index()
  public canonicalAlertId?: ObjectID = undefined;

  // ─────────────────────────────────────────────────────────────
  // DUPLICATE TRACKING
  // ─────────────────────────────────────────────────────────────

  /** Number of suppressed duplicates; the column defaults to 0. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Duplicate Count',
    description: 'Number of duplicate alerts suppressed',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
    default: 0,
  })
  public duplicateCount?: number = undefined;

  /** Timestamp of the most recent duplicate; nullable. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Last Duplicate At',
    description: 'When the last duplicate was received',
  })
  @Column({
    type: ColumnType.Date,
    nullable: true,
  })
  public lastDuplicateAt?: Date = undefined;

  // ─────────────────────────────────────────────────────────────
  // TIME WINDOW
  // ─────────────────────────────────────────────────────────────

  /** Start of the deduplication window. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Window Start',
    description: 'When this deduplication window started',
  })
  @Column({
    type: ColumnType.Date,
    nullable: false,
  })
  public windowStartAt?: Date = undefined;

  /** End of the deduplication window; indexed. */
  @ColumnAccessControl({
    create: [],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin],
    update: [],
  })
  @TableColumn({
    type: TableColumnType.Date,
    title: 'Window End',
    description: 'When this deduplication window expires',
  })
  @Column({
    type: ColumnType.Date,
    nullable: false,
  })
  @Index()
  public windowEndAt?: Date = undefined;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Alert Model Enhancements
|
||||
|
||||
Add deduplication fields to existing Alert model.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/Alert.ts` (modifications)
|
||||
|
||||
```typescript
|
||||
// Add these fields to the existing Alert model:
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// DEDUPLICATION FIELDS (NEW)
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ShortText,
|
||||
title: 'Fingerprint',
|
||||
description: 'SHA-256 fingerprint hash for deduplication',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ShortText,
|
||||
length: 64,
|
||||
nullable: true,
|
||||
})
|
||||
@Index()
|
||||
public fingerprint?: string = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Duplicate Count',
|
||||
description: 'Number of duplicate alerts that were suppressed',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public duplicateCount?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Date,
|
||||
title: 'Last Duplicate At',
|
||||
description: 'When the last duplicate occurred',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Date,
|
||||
nullable: true,
|
||||
})
|
||||
public lastDuplicateAt?: Date = undefined;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. DeduplicationConfig (Project Settings)
|
||||
|
||||
Add deduplication settings to Project or create separate settings model.
|
||||
|
||||
**Option A: Add to Project model**
|
||||
|
||||
```typescript
|
||||
// In Project model, add:
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [Permission.ProjectOwner],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
update: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.JSON,
|
||||
title: 'Deduplication Config',
|
||||
description: 'Alert deduplication settings for this project',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.JSON,
|
||||
nullable: true,
|
||||
})
|
||||
public alertDeduplicationConfig?: DeduplicationConfig = undefined;
|
||||
```
|
||||
|
||||
**Option B: Separate AlertDeduplicationConfig model**
|
||||
|
||||
```typescript
|
||||
@TableMetadata({
  tableName: 'AlertDeduplicationConfig',
  singularName: 'Deduplication Config',
  pluralName: 'Deduplication Configs',
  icon: IconProp.Settings,
  tableDescription: 'Project-level deduplication settings',
})
@Entity({
  name: 'AlertDeduplicationConfig',
})
/**
 * Option B: a dedicated settings table for deduplication.
 *
 * The unique index on `projectId` allows at most one row per project.
 * Owners and admins may create and update; members may also read.
 */
export default class AlertDeduplicationConfig extends BaseModel {
  /** Project these settings belong to; unique, so one config row per project. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.ObjectID,
    title: 'Project ID',
  })
  @Column({
    type: ColumnType.ObjectID,
    nullable: false,
  })
  @Index({ unique: true })
  public projectId?: ObjectID = undefined;

  /** Master switch for deduplication; the column defaults to true. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Boolean,
    title: 'Enabled',
    description: 'Whether deduplication is enabled',
  })
  @Column({
    type: ColumnType.Boolean,
    nullable: false,
    default: true,
  })
  public enabled?: boolean = undefined;

  /** Length of the deduplication window in minutes; the column defaults to 60. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Number,
    title: 'Window Minutes',
    description: 'Time window for deduplication (minutes)',
  })
  @Column({
    type: ColumnType.Number,
    nullable: false,
    default: 60,
  })
  public windowMinutes?: number = undefined;

  /** Names of the alert fields that feed the fingerprint hash. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.JSON,
    title: 'Fingerprint Fields',
    description: 'Fields to include in fingerprint',
  })
  @Column({
    type: ColumnType.JSON,
    nullable: false,
    // The default of a JSON column must itself be valid JSON, so the array is
    // written with double-quoted strings. The previous single-quoted form was
    // not valid JSON. The values mirror
    // DEFAULT_DEDUPLICATION_CONFIG.fingerprintFields.
    // NOTE(review): confirm how the driver quotes string defaults in the
    // generated migration.
    default: '["monitorId", "createdCriteriaId", "alertSeverityId", "title"]',
  })
  public fingerprintFields?: Array<string> = undefined;

  /** Whether string values are lowercased and trimmed before hashing; defaults to true. */
  @ColumnAccessControl({
    create: [Permission.ProjectOwner, Permission.ProjectAdmin],
    read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
    update: [Permission.ProjectOwner, Permission.ProjectAdmin],
  })
  @TableColumn({
    type: TableColumnType.Boolean,
    title: 'Normalize Strings',
    description: 'Whether to normalize strings (lowercase, trim)',
  })
  @Column({
    type: ColumnType.Boolean,
    nullable: false,
    default: true,
  })
  public normalizeStrings?: boolean = undefined;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Type Definitions
|
||||
|
||||
```typescript
|
||||
// /Common/Types/Alert/DeduplicationConfig.ts
|
||||
|
||||
/**
 * Settings that control alert deduplication.
 */
export interface DeduplicationConfig {
  /** Turns deduplication on or off. */
  enabled: boolean;

  /** Length of the deduplication time window, in minutes. */
  windowMinutes: number;

  /** Names of the alert fields that are hashed into the fingerprint. */
  fingerprintFields: Array<string>;

  /** When true, string values are lowercased and trimmed before hashing. */
  normalizeStrings: boolean;
}

/**
 * Values used when no configuration (or only part of one) is supplied.
 */
export const DEFAULT_DEDUPLICATION_CONFIG: DeduplicationConfig = {
  enabled: true,
  windowMinutes: 60,
  fingerprintFields: ['monitorId', 'createdCriteriaId', 'alertSeverityId', 'title'],
  normalizeStrings: true,
};

/**
 * Fields that can be offered as fingerprint inputs, each with a label and a
 * short description. Written as compact [field, label, description] rows and
 * expanded into objects.
 */
export const AVAILABLE_FINGERPRINT_FIELDS: Array<{
  field: string;
  label: string;
  description: string;
}> = (
  [
    ['monitorId', 'Monitor', 'Include monitor in fingerprint'],
    ['createdCriteriaId', 'Criteria', 'Include alert criteria in fingerprint'],
    ['alertSeverityId', 'Severity', 'Include severity in fingerprint'],
    ['title', 'Title', 'Include alert title in fingerprint'],
    ['description', 'Description', 'Include alert description in fingerprint'],
  ] as Array<[string, string, string]>
).map(([field, label, description]) => {
  return { field, label, description };
});
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Indexes
|
||||
|
||||
```sql
|
||||
-- AlertFingerprint indexes
-- IF NOT EXISTS makes every statement safe to re-run.

-- Duplicate lookup: project + fingerprint hash + window end.
CREATE INDEX IF NOT EXISTS idx_fingerprint_lookup
  ON "AlertFingerprint" ("projectId", "fingerprint", "windowEndAt");

-- Cleanup of expired windows filters on windowEndAt alone.
CREATE INDEX IF NOT EXISTS idx_fingerprint_cleanup
  ON "AlertFingerprint" ("windowEndAt");

-- Lookup of fingerprint rows by their canonical alert.
CREATE INDEX IF NOT EXISTS idx_fingerprint_alert
  ON "AlertFingerprint" ("canonicalAlertId");

-- Alert fingerprint index
-- Partial index: rows without a fingerprint are left out.
CREATE INDEX IF NOT EXISTS idx_alert_fingerprint
  ON "Alert" ("projectId", "fingerprint")
  WHERE "fingerprint" IS NOT NULL;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
- [ ] Create AlertFingerprint model
|
||||
- [ ] Add fingerprint fields to Alert model
|
||||
- [ ] Create DeduplicationConfig type
|
||||
- [ ] Add config to Project model (or create separate model)
|
||||
- [ ] Register models in model registry
|
||||
- [ ] Create database migrations
|
||||
- [ ] Add indexes
|
||||
- [ ] Update API permissions
|
||||
667
Docs/Plan/AlertDeduplication/2-Backend.md
Normal file
667
Docs/Plan/AlertDeduplication/2-Backend.md
Normal file
@@ -0,0 +1,667 @@
|
||||
# Backend Implementation for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the backend services and components required for Alert Deduplication functionality.
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. FingerprintGenerator
|
||||
|
||||
Generates unique fingerprints for alerts based on configurable fields.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/FingerprintGenerator.ts`
|
||||
|
||||
```typescript
|
||||
import Alert from '../../Models/DatabaseModels/Alert';
|
||||
import crypto from 'crypto';
|
||||
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
|
||||
|
||||
/**
 * Computes deterministic SHA-256 fingerprints for alerts from a configurable
 * list of fields.
 */
export default class FingerprintGenerator {
  /**
   * Default fields used for fingerprinting
   */
  public static DEFAULT_FIELDS: Array<string> = [
    'monitorId',
    'createdCriteriaId',
    'alertSeverityId',
    'title',
  ];

  /**
   * Generate a fingerprint hash for an alert.
   *
   * The selected fields are read in the configured order, optionally
   * normalized (lowercased and trimmed), and hashed as an ordered list of
   * [field, value] pairs.
   *
   * @param alert - Alert data to fingerprint; missing fields count as ''.
   * @param config - Optional overrides. `fingerprintFields` defaults to
   *   DEFAULT_FIELDS and `normalizeStrings` defaults to true.
   * @returns 64-character lowercase hex SHA-256 digest.
   */
  public static generate(
    alert: Partial<Alert>,
    config?: Partial<DeduplicationConfig>
  ): string {
    const fields = config?.fingerprintFields ?? this.DEFAULT_FIELDS;
    const normalizeStrings = config?.normalizeStrings ?? true;

    const pairs: Array<[string, string]> = fields.map(
      (field: string): [string, string] => {
        const rawValue = this.getFieldValue(alert, field);
        return [
          field,
          normalizeStrings ? rawValue.toLowerCase().trim() : rawValue,
        ];
      }
    );

    // Serialize as JSON rather than joining `field:value` parts with '|'.
    // With plain joining, a value that itself contains '|' or ':' can produce
    // the same input as a different set of values (for example title
    // "a|description:b" with an empty description versus title "a" with
    // description "b|description:"), and unrelated alerts would be treated as
    // duplicates. JSON escaping keeps every pair boundary unambiguous.
    const fingerprintInput = JSON.stringify(pairs);

    return crypto
      .createHash('sha256')
      .update(fingerprintInput)
      .digest('hex');
  }

  /**
   * Get a field value from an alert object as a string.
   *
   * Known field names are read from the alert itself ('severity' is accepted
   * as an alias of 'alertSeverityId'). Any other name is looked up in
   * `alert.customFields`. Missing values yield ''.
   */
  private static getFieldValue(alert: Partial<Alert>, field: string): string {
    switch (field) {
      case 'monitorId':
        return alert.monitorId?.toString() || '';

      case 'createdCriteriaId':
        return alert.createdCriteriaId || '';

      case 'alertSeverityId':
      case 'severity':
        return alert.alertSeverityId?.toString() || '';

      case 'title':
        return alert.title || '';

      case 'description':
        return alert.description || '';

      case 'createdByProbeId':
        return alert.createdByProbeId?.toString() || '';

      default: {
        // Try to get from customFields
        if (alert.customFields && typeof alert.customFields === 'object') {
          const customValue: unknown = (
            alert.customFields as Record<string, unknown>
          )[field];

          // `unknown` has no callable members under strict type checking, so
          // convert explicitly instead of calling `customValue?.toString()`.
          if (customValue === undefined || customValue === null) {
            return '';
          }
          return String(customValue);
        }
        return '';
      }
    }
  }

  /**
   * Validate that all required fields are present for fingerprinting.
   *
   * A field is reported as missing when its resolved value is the empty
   * string.
   *
   * @returns `valid` is true only when `missingFields` is empty.
   */
  public static validateFields(
    alert: Partial<Alert>,
    fields: Array<string>
  ): { valid: boolean; missingFields: Array<string> } {
    const missingFields: Array<string> = fields.filter(
      (field: string) => !this.getFieldValue(alert, field)
    );

    return {
      valid: missingFields.length === 0,
      missingFields,
    };
  }

  /**
   * Compare two fingerprints for exact string equality.
   */
  public static areEqual(fingerprint1: string, fingerprint2: string): boolean {
    return fingerprint1 === fingerprint2;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. DeduplicationEngine
|
||||
|
||||
Handles the core deduplication logic.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/DeduplicationEngine.ts`
|
||||
|
||||
```typescript
|
||||
import Alert from '../../Models/DatabaseModels/Alert';
|
||||
import AlertFingerprint from '../../Models/DatabaseModels/AlertFingerprint';
|
||||
import AlertFingerprintService from '../../Services/AlertFingerprintService';
|
||||
import AlertService from '../../Services/AlertService';
|
||||
import FingerprintGenerator from './FingerprintGenerator';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import OneUptimeDate from 'Common/Types/Date';
|
||||
import QueryHelper from '../../Types/Database/QueryHelper';
|
||||
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
|
||||
|
||||
/**
 * Outcome of a deduplication check. The canonical-alert fields and
 * `duplicateCount` are only populated when `isDuplicate` is true.
 */
export interface DeduplicationResult {
  isDuplicate: boolean;
  canonicalAlertId?: ObjectID;
  canonicalAlert?: Alert;
  duplicateCount?: number;
  fingerprint: string;
}

/**
 * Coordinates fingerprint generation, duplicate detection and fingerprint
 * registration for alerts.
 */
export default class DeduplicationEngine {
  /**
   * Determine whether an incoming alert matches a fingerprint whose window is
   * still open for the project.
   *
   * On a match, the duplicate counter and last-duplicate timestamp are written
   * to both the fingerprint row and the canonical alert, and the canonical
   * alert (id, title, alertNumber) is loaded for the result.
   */
  public static async checkDuplicate(
    alertData: Partial<Alert>,
    projectId: ObjectID,
    config?: Partial<DeduplicationConfig>
  ): Promise<DeduplicationResult> {
    const effectiveConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };
    const fingerprint = FingerprintGenerator.generate(alertData, effectiveConfig);

    // Look for a fingerprint row whose window has not yet ended.
    const activeMatch = await AlertFingerprintService.findOneBy({
      query: {
        projectId,
        fingerprint,
        windowEndAt: QueryHelper.greaterThan(new Date()),
      },
      select: {
        _id: true,
        canonicalAlertId: true,
        duplicateCount: true,
      },
      props: { isRoot: true },
    });

    // No open window for this fingerprint: the alert is new.
    if (!activeMatch) {
      return {
        isDuplicate: false,
        fingerprint,
      };
    }

    const incrementedCount = (activeMatch.duplicateCount || 0) + 1;

    // Record the duplicate on the fingerprint row ...
    await AlertFingerprintService.updateOneById({
      id: activeMatch.id!,
      data: {
        duplicateCount: incrementedCount,
        lastDuplicateAt: new Date(),
      },
      props: { isRoot: true },
    });

    // ... and mirror the counter onto the canonical alert.
    await AlertService.updateOneById({
      id: activeMatch.canonicalAlertId!,
      data: {
        duplicateCount: incrementedCount,
        lastDuplicateAt: new Date(),
      },
      props: { isRoot: true },
    });

    // Load the canonical alert so callers can reference it.
    const original = await AlertService.findOneById({
      id: activeMatch.canonicalAlertId!,
      select: {
        _id: true,
        title: true,
        alertNumber: true,
      },
      props: { isRoot: true },
    });

    return {
      isDuplicate: true,
      canonicalAlertId: activeMatch.canonicalAlertId,
      canonicalAlert: original || undefined,
      duplicateCount: incrementedCount,
      fingerprint,
    };
  }

  /**
   * Open a new deduplication window for a freshly created alert.
   *
   * Uses the alert's stored fingerprint when it has one, otherwise computes
   * it. The window runs from now for `windowMinutes`.
   */
  public static async registerFingerprint(
    alert: Alert,
    config?: Partial<DeduplicationConfig>
  ): Promise<AlertFingerprint> {
    const effectiveConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };

    const fingerprint =
      alert.fingerprint || FingerprintGenerator.generate(alert, effectiveConfig);

    const windowStart = new Date();
    const windowEnd = OneUptimeDate.addRemoveMinutes(
      windowStart,
      effectiveConfig.windowMinutes
    );

    return await AlertFingerprintService.create({
      data: {
        projectId: alert.projectId,
        fingerprint,
        fingerprintFields: effectiveConfig.fingerprintFields,
        canonicalAlertId: alert.id,
        duplicateCount: 0,
        windowStartAt: windowStart,
        windowEndAt: windowEnd,
      } as AlertFingerprint,
      props: { isRoot: true },
    });
  }

  /**
   * Run an alert through deduplication.
   *
   * Always returns an object. `shouldCreate` is false when the alert is a
   * duplicate. Otherwise it is true and `alertData` carries the computed
   * fingerprint. When deduplication is disabled, the fingerprint is still
   * computed and attached, but no duplicate lookup is made.
   */
  public static async processAlert(
    alertData: Partial<Alert>,
    projectId: ObjectID,
    config?: Partial<DeduplicationConfig>
  ): Promise<{
    shouldCreate: boolean;
    alertData: Partial<Alert>;
    deduplicationResult: DeduplicationResult;
  }> {
    const effectiveConfig = { ...DEFAULT_DEDUPLICATION_CONFIG, ...config };

    if (effectiveConfig.enabled) {
      const outcome = await this.checkDuplicate(
        alertData,
        projectId,
        effectiveConfig
      );

      // Duplicate: pass the alert data back untouched and tell the caller
      // not to create it.
      if (outcome.isDuplicate) {
        return {
          shouldCreate: false,
          alertData,
          deduplicationResult: outcome,
        };
      }

      // New alert: attach the fingerprint that was just computed.
      return {
        shouldCreate: true,
        alertData: { ...alertData, fingerprint: outcome.fingerprint },
        deduplicationResult: outcome,
      };
    }

    // Deduplication disabled: tag the alert with its fingerprint only.
    const fingerprint = FingerprintGenerator.generate(alertData, effectiveConfig);
    return {
      shouldCreate: true,
      alertData: { ...alertData, fingerprint },
      deduplicationResult: {
        isDuplicate: false,
        fingerprint,
      },
    };
  }

  /**
   * Summarize deduplication for a project over fingerprint windows that
   * started between `startDate` and `endDate`.
   *
   * `deduplicationRate` is the percentage of duplicates among all alerts,
   * rounded to two decimals (0 when there were no alerts).
   */
  public static async getStatistics(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<{
    totalAlerts: number;
    uniqueAlerts: number;
    duplicateCount: number;
    deduplicationRate: number;
  }> {
    const windows = await AlertFingerprintService.findBy({
      query: {
        projectId,
        windowStartAt: QueryHelper.between(startDate, endDate),
      },
      select: {
        _id: true,
        duplicateCount: true,
      },
      props: { isRoot: true },
    });

    // Each fingerprint row stands for one unique alert.
    const uniqueAlerts = windows.length;

    let duplicateCount = 0;
    for (const window of windows) {
      duplicateCount += window.duplicateCount || 0;
    }

    const totalAlerts = uniqueAlerts + duplicateCount;

    let deduplicationRate = 0;
    if (totalAlerts > 0) {
      deduplicationRate = (duplicateCount / totalAlerts) * 100;
    }

    return {
      totalAlerts,
      uniqueAlerts,
      duplicateCount,
      deduplicationRate: Math.round(deduplicationRate * 100) / 100,
    };
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. AlertFingerprintService
|
||||
|
||||
Database service for AlertFingerprint model.
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertFingerprintService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import AlertFingerprint from '../Models/DatabaseModels/AlertFingerprint';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import QueryHelper from '../Types/Database/QueryHelper';
|
||||
|
||||
/**
 * Database service for the AlertFingerprint model, with helpers for window
 * maintenance. All helpers run with root props.
 */
export class Service extends DatabaseService<AlertFingerprint> {
  public constructor() {
    super(AlertFingerprint);
  }

  /**
   * Delete every fingerprint whose window ended before the current time.
   *
   * @returns whatever `deleteBy` resolves to (declared as a number).
   */
  public async cleanupExpired(): Promise<number> {
    const cutoff = new Date();

    return await this.deleteBy({
      query: {
        windowEndAt: QueryHelper.lessThan(cutoff),
      },
      props: { isRoot: true },
    });
  }

  /**
   * List a project's fingerprints whose window is still open, selecting the
   * hash, canonical alert id, duplicate count and window end.
   */
  public async getActiveFingerprints(
    projectId: ObjectID
  ): Promise<Array<AlertFingerprint>> {
    const now = new Date();

    const activeRows = await this.findBy({
      query: {
        projectId,
        windowEndAt: QueryHelper.greaterThan(now),
      },
      select: {
        _id: true,
        fingerprint: true,
        canonicalAlertId: true,
        duplicateCount: true,
        windowEndAt: true,
      },
      props: { isRoot: true },
    });

    return activeRows;
  }

  /**
   * Move the end of a fingerprint's window to `newEndTime` (intended for
   * alerts that are still active).
   */
  public async extendWindow(
    fingerprintId: ObjectID,
    newEndTime: Date
  ): Promise<void> {
    await this.updateOneById({
      id: fingerprintId,
      data: { windowEndAt: newEndTime },
      props: { isRoot: true },
    });
  }
}

export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Integration with AlertService
|
||||
|
||||
Modify AlertService to use deduplication.
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertService.ts` (modifications)
|
||||
|
||||
```typescript
|
||||
import DeduplicationEngine from '../Utils/Alert/DeduplicationEngine';
|
||||
import { DeduplicationConfig, DEFAULT_DEDUPLICATION_CONFIG } from 'Common/Types/Alert/DeduplicationConfig';
|
||||
|
||||
// In onBeforeCreate():
|
||||
protected async onBeforeCreate(
|
||||
createBy: CreateBy<Alert>
|
||||
): Promise<OnCreate<Alert>> {
|
||||
// ... existing code ...
|
||||
|
||||
// Get deduplication config for project
|
||||
const deduplicationConfig = await this.getDeduplicationConfig(
|
||||
createBy.data.projectId!
|
||||
);
|
||||
|
||||
// Process through deduplication engine
|
||||
const deduplicationResult = await DeduplicationEngine.processAlert(
|
||||
createBy.data,
|
||||
createBy.data.projectId!,
|
||||
deduplicationConfig
|
||||
);
|
||||
|
||||
if (!deduplicationResult.shouldCreate) {
|
||||
// This is a duplicate - don't create
|
||||
throw new DuplicateAlertException(
|
||||
`Duplicate of alert #${deduplicationResult.deduplicationResult.canonicalAlert?.alertNumber}`,
|
||||
deduplicationResult.deduplicationResult.canonicalAlertId!
|
||||
);
|
||||
}
|
||||
|
||||
// Add fingerprint to alert data
|
||||
createBy.data.fingerprint = deduplicationResult.alertData.fingerprint;
|
||||
|
||||
// ... rest of existing code ...
|
||||
}
|
||||
|
||||
// In onCreateSuccess():
|
||||
/**
 * Post-create hook: registers the new alert's fingerprint with the
 * deduplication engine when deduplication is enabled for its project.
 *
 * @param onCreate - Result object produced by the pre-create hook.
 * @param createdItem - The alert that was just persisted.
 */
protected async onCreateSuccess(
  onCreate: OnCreate<Alert>,
  createdItem: Alert
): Promise<Alert> {
  // ... existing code ...

  // Register fingerprint for deduplication
  const deduplicationConfig = await this.getDeduplicationConfig(
    createdItem.projectId!
  );

  // Projects with deduplication switched off never get a fingerprint record.
  if (deduplicationConfig.enabled) {
    await DeduplicationEngine.registerFingerprint(
      createdItem,
      deduplicationConfig
    );
  }

  // ... rest of existing code ...
}
|
||||
|
||||
// Helper method:
|
||||
/**
 * Loads the deduplication settings stored on a project.
 *
 * @param projectId - Project whose `alertDeduplicationConfig` is read.
 * @returns The project's config, or `DEFAULT_DEDUPLICATION_CONFIG` when the
 *   project is not found or has no config set.
 */
private async getDeduplicationConfig(
  projectId: ObjectID
): Promise<DeduplicationConfig> {
  const project = await ProjectService.findOneById({
    id: projectId,
    select: { alertDeduplicationConfig: true },
    props: { isRoot: true },
  });

  // `??` falls back only when the value is null/undefined.
  return project?.alertDeduplicationConfig ?? DEFAULT_DEDUPLICATION_CONFIG;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. DuplicateAlertException
|
||||
|
||||
Custom exception for duplicate alerts.
|
||||
|
||||
**File Location:** `/Common/Types/Exception/DuplicateAlertException.ts`
|
||||
|
||||
```typescript
|
||||
import Exception from './Exception';
|
||||
import ExceptionCode from './ExceptionCode';
|
||||
import ObjectID from '../ObjectID';
|
||||
|
||||
/**
 * Raised when an incoming alert is rejected because it duplicates an
 * existing alert. Carries the ID of that existing (canonical) alert.
 */
export default class DuplicateAlertException extends Exception {
  /** ID of the alert that the rejected alert duplicates. */
  public canonicalAlertId: ObjectID;

  /**
   * @param message - Human-readable description of the duplicate.
   * @param canonicalAlertId - ID of the alert that the rejected alert duplicates.
   */
  public constructor(message: string, canonicalAlertId: ObjectID) {
    super(ExceptionCode.DuplicateAlertException, message);
    this.canonicalAlertId = canonicalAlertId;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Worker Jobs
|
||||
|
||||
### 1. FingerprintCleanup Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertDeduplication/FingerprintCleanup.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
import { EVERY_HOUR } from 'Common/Utils/CronTime';
import AlertFingerprintService from 'Common/Server/Services/AlertFingerprintService';
import logger from 'Common/Server/Utils/Logger';
|
||||
|
||||
/*
 * Hourly job that removes expired alert fingerprints.
 * Does not run on worker startup; logs only when something was deleted.
 */
RunCron(
  'AlertDeduplication:FingerprintCleanup',
  { schedule: EVERY_HOUR, runOnStartup: false },
  async () => {
    const deletedCount = await AlertFingerprintService.cleanupExpired();

    if (deletedCount > 0) {
      logger.info(`Cleaned up ${deletedCount} expired fingerprints`);
    }
  }
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Redis Caching (Optional Enhancement)
|
||||
|
||||
For high-throughput systems, cache fingerprints in Redis.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/FingerprintCache.ts`
|
||||
|
||||
```typescript
|
||||
import Redis from '../../Infrastructure/Redis';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
|
||||
/**
 * Redis-backed cache of alert fingerprints, keyed by project ID and
 * fingerprint hash.
 *
 * Each fingerprint uses two keys: the entry itself (a JSON snapshot written
 * by `set`) and a sibling `<key>:count` counter bumped by
 * `incrementDuplicateCount`.
 *
 * NOTE(review): the counter is independent of the `duplicateCount` stored in
 * the JSON snapshot, and the counter key is created without a TTL — confirm
 * whether it should expire together with the entry.
 */
export default class FingerprintCache {
  private static readonly CACHE_PREFIX = 'alert:fingerprint:';
  private static readonly DEFAULT_TTL_SECONDS = 3600; // 1 hour

  /**
   * Get a cached fingerprint.
   *
   * @returns The cached entry, or `null` when the key is missing or the
   *   stored value is not a well-formed entry (treated as a cache miss).
   */
  public static async get(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<{ canonicalAlertId: string; duplicateCount: number } | null> {
    const key = this.buildKey(projectId, fingerprint);
    const value = await Redis.get(key);

    if (!value) {
      return null;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(value);
    } catch {
      // A corrupt cache value must not break alert creation.
      return null;
    }

    if (!this.isCacheEntry(parsed)) {
      return null;
    }

    return parsed;
  }

  /**
   * Set a fingerprint in cache.
   *
   * @param ttlSeconds - Expiry of the entry; defaults to one hour.
   */
  public static async set(
    projectId: ObjectID,
    fingerprint: string,
    data: { canonicalAlertId: string; duplicateCount: number },
    ttlSeconds: number = this.DEFAULT_TTL_SECONDS
  ): Promise<void> {
    const key = this.buildKey(projectId, fingerprint);
    await Redis.setex(key, ttlSeconds, JSON.stringify(data));
  }

  /**
   * Increment duplicate count in cache.
   *
   * @returns The counter value after the increment.
   */
  public static async incrementDuplicateCount(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<number> {
    const countKey = this.buildCountKey(projectId, fingerprint);
    return await Redis.incr(countKey);
  }

  /**
   * Delete a fingerprint from cache, including its duplicate counter.
   */
  public static async delete(
    projectId: ObjectID,
    fingerprint: string
  ): Promise<void> {
    await Redis.del(this.buildKey(projectId, fingerprint));
    // Remove the counter too so a later fingerprint does not inherit a stale count.
    await Redis.del(this.buildCountKey(projectId, fingerprint));
  }

  /** Key of the JSON entry for a project/fingerprint pair. */
  private static buildKey(projectId: ObjectID, fingerprint: string): string {
    return `${this.CACHE_PREFIX}${projectId.toString()}:${fingerprint}`;
  }

  /** Key of the duplicate counter that sits next to the JSON entry. */
  private static buildCountKey(
    projectId: ObjectID,
    fingerprint: string
  ): string {
    return `${this.buildKey(projectId, fingerprint)}:count`;
  }

  /** Runtime check that a parsed cache value has the expected fields. */
  private static isCacheEntry(
    value: unknown
  ): value is { canonicalAlertId: string; duplicateCount: number } {
    if (typeof value !== 'object' || value === null) {
      return false;
    }

    const entry = value as Record<string, unknown>;
    return (
      typeof entry['canonicalAlertId'] === 'string' &&
      typeof entry['duplicateCount'] === 'number'
    );
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Phase 1: Core Components
|
||||
- [ ] Create FingerprintGenerator utility
|
||||
- [ ] Create DeduplicationEngine
|
||||
- [ ] Create AlertFingerprintService
|
||||
- [ ] Create DuplicateAlertException
|
||||
|
||||
### Phase 2: Integration
|
||||
- [ ] Modify AlertService.onBeforeCreate()
|
||||
- [ ] Modify AlertService.onCreateSuccess()
|
||||
- [ ] Add fingerprint fields to Alert model
|
||||
- [ ] Create AlertFingerprint model
|
||||
|
||||
### Phase 3: Background Jobs
|
||||
- [ ] Create FingerprintCleanup job
|
||||
- [ ] Register job in worker
|
||||
|
||||
### Phase 4: Testing
|
||||
- [ ] Unit tests for FingerprintGenerator
|
||||
- [ ] Unit tests for DeduplicationEngine
|
||||
- [ ] Integration tests for deduplication flow
|
||||
- [ ] Performance tests for high-volume scenarios
|
||||
|
||||
### Phase 5: Optional Enhancements
|
||||
- [ ] Redis caching for fingerprints
|
||||
- [ ] Configurable fingerprint fields per project
|
||||
- [ ] Deduplication analytics API
|
||||
287
Docs/Plan/AlertDeduplication/3-API.md
Normal file
287
Docs/Plan/AlertDeduplication/3-API.md
Normal file
@@ -0,0 +1,287 @@
|
||||
# API Design for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the REST API endpoints for Alert Deduplication functionality.
|
||||
|
||||
## Deduplication Configuration API
|
||||
|
||||
### Get Deduplication Config
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-deduplication-config
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"enabled": true,
|
||||
"windowMinutes": 60,
|
||||
"fingerprintFields": ["monitorId", "createdCriteriaId", "alertSeverityId", "title"],
|
||||
"normalizeStrings": true
|
||||
}
|
||||
```
|
||||
|
||||
### Update Deduplication Config
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-deduplication-config
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"enabled": true,
|
||||
"windowMinutes": 120,
|
||||
"fingerprintFields": ["monitorId", "alertSeverityId", "title"],
|
||||
"normalizeStrings": true
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Deduplication Statistics API
|
||||
|
||||
### Get Deduplication Statistics
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-deduplication-stats
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"period": {
|
||||
"startDate": "2026-01-13T00:00:00Z",
|
||||
"endDate": "2026-01-20T00:00:00Z"
|
||||
},
|
||||
"totalAlerts": 5000,
|
||||
"uniqueAlerts": 2500,
|
||||
"duplicateCount": 2500,
|
||||
"deduplicationRate": 50.0,
|
||||
"topDuplicatedAlerts": [
|
||||
{
|
||||
"alertId": "alert-1",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"duplicateCount": 150,
|
||||
"monitor": { "name": "mysql-prod" }
|
||||
},
|
||||
{
|
||||
"alertId": "alert-2",
|
||||
"alertTitle": "API latency high",
|
||||
"duplicateCount": 89,
|
||||
"monitor": { "name": "api-gateway" }
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert Fingerprint API
|
||||
|
||||
### List Active Fingerprints
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-fingerprint
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "fingerprint-1",
|
||||
"fingerprint": "a1b2c3d4...",
|
||||
"canonicalAlert": {
|
||||
"_id": "alert-1",
|
||||
"alertNumber": 123,
|
||||
"title": "MySQL connection timeout"
|
||||
},
|
||||
"duplicateCount": 15,
|
||||
"lastDuplicateAt": "2026-01-20T10:45:00Z",
|
||||
"windowEndAt": "2026-01-20T11:00:00Z"
|
||||
}
|
||||
],
|
||||
"count": 50
|
||||
}
|
||||
```
|
||||
|
||||
### Get Fingerprint Details
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-fingerprint/{fingerprintId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert Response Enhancement
|
||||
|
||||
The Alert response now includes deduplication fields:
|
||||
|
||||
```json
|
||||
{
|
||||
"_id": "alert-1",
|
||||
"alertNumber": 123,
|
||||
"title": "MySQL connection timeout",
|
||||
"fingerprint": "a1b2c3d4e5f6...",
|
||||
"duplicateCount": 15,
|
||||
"lastDuplicateAt": "2026-01-20T10:45:00Z",
|
||||
"// ... other fields"
|
||||
}
|
||||
```
|
||||
|
||||
### Filter Alerts by Duplicate Count
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert?duplicateCount.gt=10
|
||||
```
|
||||
|
||||
Get alerts with more than 10 duplicates.
|
||||
|
||||
---
|
||||
|
||||
## Available Fingerprint Fields API
|
||||
|
||||
### Get Available Fields
|
||||
|
||||
```http
|
||||
GET /api/alert-deduplication-config/available-fields
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"fields": [
|
||||
{
|
||||
"field": "monitorId",
|
||||
"label": "Monitor",
|
||||
"description": "Include monitor in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "createdCriteriaId",
|
||||
"label": "Criteria",
|
||||
"description": "Include alert criteria in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "alertSeverityId",
|
||||
"label": "Severity",
|
||||
"description": "Include severity in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "title",
|
||||
"label": "Title",
|
||||
"description": "Include alert title in fingerprint"
|
||||
},
|
||||
{
|
||||
"field": "description",
|
||||
"label": "Description",
|
||||
"description": "Include alert description in fingerprint"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Test Fingerprint API
|
||||
|
||||
### Generate Test Fingerprint
|
||||
|
||||
Test what fingerprint would be generated for given alert data.
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-deduplication-config/test
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertData": {
|
||||
"monitorId": "monitor-1",
|
||||
"alertSeverityId": "severity-1",
|
||||
"title": "MySQL connection timeout"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"fingerprint": "a1b2c3d4e5f6...",
|
||||
"fieldsUsed": ["monitorId", "alertSeverityId", "title"],
|
||||
"fieldValues": {
|
||||
"monitorId": "monitor-1",
|
||||
"alertSeverityId": "severity-1",
|
||||
"title": "mysql connection timeout"
|
||||
},
|
||||
"wouldBeDuplicateOf": {
|
||||
"alertId": "alert-123",
|
||||
"alertNumber": 123,
|
||||
"alertTitle": "MySQL connection timeout"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Responses
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "DUPLICATE_ALERT",
|
||||
"message": "Duplicate of alert #123",
|
||||
"data": {
|
||||
"canonicalAlertId": "alert-123",
|
||||
"canonicalAlertNumber": 123,
|
||||
"duplicateCount": 16
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Note: This is typically not shown to users as duplicates are handled silently.
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Configuration API
|
||||
- [ ] GET /alert-deduplication-config
|
||||
- [ ] PUT /alert-deduplication-config
|
||||
- [ ] GET /alert-deduplication-config/available-fields
|
||||
- [ ] POST /alert-deduplication-config/test
|
||||
|
||||
### Statistics API
|
||||
- [ ] GET /alert-deduplication-stats
|
||||
|
||||
### Fingerprint API
|
||||
- [ ] GET /alert-fingerprint (list)
|
||||
- [ ] GET /alert-fingerprint/:id (details)
|
||||
|
||||
### Alert API Updates
|
||||
- [ ] Add fingerprint to response
|
||||
- [ ] Add duplicateCount to response
|
||||
- [ ] Add duplicateCount filter
|
||||
259
Docs/Plan/AlertDeduplication/4-UI.md
Normal file
259
Docs/Plan/AlertDeduplication/4-UI.md
Normal file
@@ -0,0 +1,259 @@
|
||||
# UI Implementation for Alert Deduplication
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the frontend components and pages required for Alert Deduplication functionality.
|
||||
|
||||
## Navigation Structure
|
||||
|
||||
```
|
||||
Dashboard
|
||||
└── Settings
|
||||
└── Alerts
|
||||
├── Alert States (existing)
|
||||
├── Alert Severities (existing)
|
||||
├── Grouping Rules
|
||||
├── Suppression Rules
|
||||
└── Deduplication (NEW)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages to Create
|
||||
|
||||
### 1. Deduplication Settings Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertDeduplication.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-deduplication`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Alert Deduplication │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Deduplication prevents duplicate alerts from being created. When a duplicate │ │
|
||||
│ │ alert is detected, it increments the count on the original alert instead. │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ DEDUPLICATION STATUS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ✅ Deduplication is ENABLED [Disable] │ │
|
||||
│ └──────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ STATISTICS (Last 7 Days) │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
|
||||
│ │ Total Alerts │ │ Unique Alerts │ │ Deduplicated │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ 5,000 │ │ 2,500 │ │ 2,500 │ │
|
||||
│ │ │ │ │ │ (50%) │ │
|
||||
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
|
||||
│ │
|
||||
│ CONFIGURATION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Deduplication Window │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 60 │ minutes │
|
||||
│ └──────────┘ │
|
||||
│ Alerts with the same fingerprint within this window are considered duplicates. │
|
||||
│ │
|
||||
│ Fingerprint Fields │
|
||||
│ Select which fields to include when computing the alert fingerprint: │
|
||||
│ │
|
||||
│ ☑ Monitor - Include monitor in fingerprint │
|
||||
│ ☑ Criteria - Include alert criteria in fingerprint │
|
||||
│ ☑ Severity - Include severity level in fingerprint │
|
||||
│ ☑ Title - Include alert title in fingerprint │
|
||||
│ ☐ Description - Include alert description in fingerprint │
|
||||
│ │
|
||||
│ String Normalization │
|
||||
│ ☑ Normalize strings (convert to lowercase, trim whitespace) │
|
||||
│ │
|
||||
│ [Save Changes] │
|
||||
│ │
|
||||
│ TEST FINGERPRINT │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Test what fingerprint would be generated for an alert: │
|
||||
│ │
|
||||
│ Monitor Severity Title │
|
||||
│ ┌───────────────┐ ┌───────────────┐ ┌─────────────────────────────────────────┐ │
|
||||
│ │ mysql-prod [▼]│ │ Critical [▼] │ │ Connection timeout │ │
|
||||
│ └───────────────┘ └───────────────┘ └─────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [Generate Fingerprint] │
|
||||
│ │
|
||||
│ Result: │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Fingerprint: a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6 │ │
|
||||
│ │ This would be a DUPLICATE of Alert #123: "MySQL connection timeout" │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Alert Detail Enhancement
|
||||
|
||||
Add deduplication info to Alert detail page.
|
||||
|
||||
**Wireframe Addition:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alert #123: MySQL connection timeout │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────┐ │
|
||||
│ │ DEDUPLICATION INFO │ │
|
||||
│ │ ────────────────────────────────────────── │ │
|
||||
│ │ │ │
|
||||
│ │ 🔢 Duplicate Count: 15 │ │
|
||||
│ │ This alert represents 16 total │ │
|
||||
│ │ occurrences (1 original + 15 dupes) │ │
|
||||
│ │ │ │
|
||||
│ │ 🕐 Last Duplicate: 10 minutes ago │ │
|
||||
│ │ │ │
|
||||
│ │ 🔑 Fingerprint: │ │
|
||||
│ │ a1b2c3d4e5f6... [Copy]│ │
|
||||
│ └──────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ // ... rest of alert details │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Alerts Table Enhancement
|
||||
|
||||
Add duplicate count column to alerts table.
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌───────┬──────────────────────────────────┬──────────┬───────┬──────┬────────────┐│
|
||||
│ │ ID │ Title │ Severity │ Dupes │ State│ Age ││
|
||||
│ ├───────┼──────────────────────────────────┼──────────┼───────┼──────┼────────────┤│
|
||||
│ │ #127 │ MySQL connection timeout │ Critical │ x15 │ ● │ 2m ││
|
||||
│ │ #126 │ Disk space low │ Warning │ x3 │ ● │ 15m ││
|
||||
│ │ #125 │ API response slow │ High │ — │ ✓ │ 1h ││
|
||||
│ │ #124 │ Memory usage high │ Warning │ x47 │ ● │ 2h ││
|
||||
│ └───────┴──────────────────────────────────┴──────────┴───────┴──────┴────────────┘│
|
||||
│ │
|
||||
│ Dupes = Number of duplicate alerts suppressed │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Components to Create
|
||||
|
||||
### 1. DeduplicationStatsCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/DeduplicationStatsCard.tsx`
|
||||
|
||||
Shows deduplication statistics in a card format.
|
||||
|
||||
```typescript
|
||||
/** Props for the card that summarises deduplication statistics. */
interface DeduplicationStatsCardProps {
  /** Total number of alerts in the period. */
  totalAlerts: number;
  /** Number of unique alerts in the period. */
  uniqueAlerts: number;
  /** Number of alerts that were deduplicated. */
  duplicateCount: number;
  /**
   * Deduplication rate; presumably a percentage, since the statistics API
   * example reports 50.0 for 2500 duplicates out of 5000 alerts — confirm.
   */
  deduplicationRate: number;
}
|
||||
```
|
||||
|
||||
### 2. FingerprintFieldSelector
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/FingerprintFieldSelector.tsx`
|
||||
|
||||
Checkbox list for selecting fingerprint fields.
|
||||
|
||||
```typescript
|
||||
/** Props for the checkbox list used to pick fingerprint fields. */
interface FingerprintFieldSelectorProps {
  /** Field identifiers that are currently selected. */
  selectedFields: Array<string>;
  /** Called with the new selection when it changes. */
  onChange: (fields: Array<string>) => void;
  /** Fields offered for selection, with their display text. */
  availableFields: Array<{
    field: string;
    label: string;
    description: string;
  }>;
}
|
||||
```
|
||||
|
||||
### 3. FingerprintTester
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/FingerprintTester.tsx`
|
||||
|
||||
Form for testing fingerprint generation.
|
||||
|
||||
### 4. DuplicateCountBadge
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/DuplicateCountBadge.tsx`
|
||||
|
||||
Badge showing duplicate count.
|
||||
|
||||
```typescript
|
||||
/** Props for the badge that shows an alert's duplicate count. */
interface DuplicateCountBadgeProps {
  /** Number of duplicates to display. */
  count: number;
  /**
   * Optional flag; presumably controls whether the badge is rendered when
   * `count` is 0 — confirm against the component implementation.
   */
  showIfZero?: boolean;
}
|
||||
```
|
||||
|
||||
### 5. DeduplicationInfoCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Deduplication/DeduplicationInfoCard.tsx`
|
||||
|
||||
Card for alert detail page showing deduplication info.
|
||||
|
||||
---
|
||||
|
||||
## Routing Configuration
|
||||
|
||||
Add to route configuration:
|
||||
|
||||
```typescript
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-deduplication',
|
||||
component: AlertDeduplicationPage,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pages
|
||||
- [ ] Deduplication settings page
|
||||
|
||||
### Components
|
||||
- [ ] DeduplicationStatsCard
|
||||
- [ ] FingerprintFieldSelector
|
||||
- [ ] FingerprintTester
|
||||
- [ ] DuplicateCountBadge
|
||||
- [ ] DeduplicationInfoCard
|
||||
|
||||
### Existing Page Updates
|
||||
- [ ] Add duplicate count column to Alerts table
|
||||
- [ ] Add deduplication info to Alert detail page
|
||||
- [ ] Add sidebar navigation item
|
||||
|
||||
### Styling
|
||||
- [ ] Stats card styles
|
||||
- [ ] Badge styles
|
||||
- [ ] Field selector styles
|
||||
165
Docs/Plan/AlertDeduplication/README.md
Normal file
165
Docs/Plan/AlertDeduplication/README.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# Alert Deduplication Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This sub-plan details the implementation of Alert Deduplication and Fingerprinting functionality for OneUptime. This feature prevents duplicate alerts from being created and tracks duplicate occurrences.
|
||||
|
||||
## Documents
|
||||
|
||||
| Document | Description |
|
||||
|----------|-------------|
|
||||
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
|
||||
| [2-Backend.md](./2-Backend.md) | Backend services and deduplication engine |
|
||||
| [3-API.md](./3-API.md) | REST API endpoints |
|
||||
| [4-UI.md](./4-UI.md) | Frontend components and pages |
|
||||
|
||||
## Feature Summary
|
||||
|
||||
### What is Alert Deduplication?
|
||||
|
||||
Alert Deduplication prevents the same alert from being created multiple times within a configurable time window. Instead of creating duplicate alerts, the system increments a counter on the original alert.
|
||||
|
||||
### How Fingerprinting Works
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alert Fingerprint Generation │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
Alert Data Fingerprint Fields Hash
|
||||
┌─────────────────┐ ┌───────────────────┐ ┌────────────┐
|
||||
│ monitorId: abc │ │ monitorId: abc │ │ │
|
||||
│ criteriaId: xyz │ ──► │ criteriaId: xyz │ ──► │ SHA-256 │
|
||||
│ severity: high │ │ severity: high │ │ = a1b2c3.. │
|
||||
│ title: "Error" │ │ title: "Error" │ │ │
|
||||
│ time: 10:00 AM │ │ (time excluded) │ │ │
|
||||
└─────────────────┘ └───────────────────┘ └────────────┘
|
||||
```
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
1. **Fingerprint Generation** - Compute unique hash from alert fields
|
||||
2. **Time-Window Deduplication** - Deduplicate within configurable window
|
||||
3. **Duplicate Counting** - Track how many duplicates were suppressed
|
||||
4. **Configurable Fields** - Choose which fields to include in fingerprint
|
||||
5. **Per-Project Settings** - Customize deduplication per project
|
||||
|
||||
### Benefits
|
||||
|
||||
| Without Deduplication | With Deduplication |
|
||||
|-----------------------|-------------------|
|
||||
| 100 identical alerts created | 1 alert with count: 100 |
|
||||
| 100 notifications sent | 1 notification sent |
|
||||
| Alert fatigue | Reduced noise |
|
||||
| Storage waste | Efficient storage |
|
||||
|
||||
### User Stories
|
||||
|
||||
```
|
||||
As an operator, I want duplicate alerts to be automatically merged
|
||||
so that I don't see the same alert repeated 50 times.
|
||||
|
||||
As a team lead, I want to know how many times an alert occurred
|
||||
so that I can understand the severity of the issue.
|
||||
|
||||
As an SRE, I want to configure the deduplication window
|
||||
so that I can tune it for my team's workflow.
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Core Fingerprinting (Week 1)
|
||||
|
||||
- [ ] Create FingerprintGenerator utility
|
||||
- [ ] Add fingerprint field to Alert model
|
||||
- [ ] Implement basic SHA-256 fingerprinting
|
||||
- [ ] Add duplicate count field to Alert
|
||||
|
||||
### Phase 2: Deduplication Engine (Week 2)
|
||||
|
||||
- [ ] Create AlertFingerprint cache model
|
||||
- [ ] Implement DeduplicationEngine
|
||||
- [ ] Integrate with AlertService
|
||||
- [ ] Add time-window support
|
||||
|
||||
### Phase 3: Configuration & UI (Week 3)
|
||||
|
||||
- [ ] Add project-level deduplication settings
|
||||
- [ ] Create deduplication configuration UI
|
||||
- [ ] Add duplicate count to Alert detail view
|
||||
- [ ] Add deduplication metrics
|
||||
|
||||
### Phase 4: Advanced Features (Week 4)
|
||||
|
||||
- [ ] Configurable fingerprint fields
|
||||
- [ ] Redis caching for fingerprints
|
||||
- [ ] Deduplication analytics dashboard
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Deduplication Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌──────────────────────┐
|
||||
│ Alert Trigger │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ FingerprintGenerator │
|
||||
│ .generate() │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ DeduplicationEngine │
|
||||
│ .checkDuplicate() │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
┌────────────────┴────────────────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐
|
||||
│ DUPLICATE │ │ NEW │
|
||||
│ │ │ │
|
||||
│ - Increment │ │ - Create alert │
|
||||
│ count on │ │ - Register │
|
||||
│ original │ │ fingerprint │
|
||||
│ - Skip creation │ │ - Send notifs │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
```typescript
|
||||
/** Per-project settings that control alert deduplication. */
interface DeduplicationConfig {
  // Enable/disable deduplication
  enabled: boolean;

  // Time window for deduplication (minutes)
  windowMinutes: number; // Default: 60

  // Fields to include in fingerprint.
  // Identifiers match those returned by the available-fields API.
  fingerprintFields: Array<string>; // Default: ['monitorId', 'createdCriteriaId', 'alertSeverityId', 'title']

  // Whether to normalize strings (lowercase, trim)
  normalizeStrings: boolean; // Default: true
}
|
||||
```
|
||||
|
||||
## Success Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Duplicate detection accuracy | > 99% |
|
||||
| Fingerprint generation time | < 5ms |
|
||||
| Storage reduction | 30-50% |
|
||||
| Notification reduction | 40-60% |
|
||||
|
||||
## References
|
||||
|
||||
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
|
||||
- [Alert Grouping Plan](../AlertGrouping/README.md)
|
||||
- [Alert Suppression Plan](../AlertSuppression/README.md)
|
||||
1047
Docs/Plan/AlertGrouping/1-DataModels.md
Normal file
1047
Docs/Plan/AlertGrouping/1-DataModels.md
Normal file
File diff suppressed because it is too large
Load Diff
1153
Docs/Plan/AlertGrouping/2-Backend.md
Normal file
1153
Docs/Plan/AlertGrouping/2-Backend.md
Normal file
File diff suppressed because it is too large
Load Diff
606
Docs/Plan/AlertGrouping/3-API.md
Normal file
606
Docs/Plan/AlertGrouping/3-API.md
Normal file
@@ -0,0 +1,606 @@
|
||||
# API Design for Alert Grouping
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the REST API endpoints for Alert Grouping / Episodes functionality.
|
||||
|
||||
## Base URLs
|
||||
|
||||
All endpoints are prefixed with the project scope:
|
||||
|
||||
```
|
||||
/api/project/{projectId}/alert-episode
|
||||
/api/project/{projectId}/alert-grouping-rule
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Episodes API
|
||||
|
||||
### List Episodes
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-episode
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `currentAlertStateId` | ObjectID | Filter by state |
|
||||
| `alertSeverityId` | ObjectID | Filter by severity |
|
||||
| `groupingRuleId` | ObjectID | Filter by grouping rule |
|
||||
| `startedAt` | DateRange | Filter by start time |
|
||||
| `search` | string | Search in title/description |
|
||||
| `limit` | number | Results per page (default: 10) |
|
||||
| `skip` | number | Pagination offset |
|
||||
| `sort` | string | Sort field (default: `-lastActivityAt`) |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "episode-id-1",
|
||||
"episodeNumber": 42,
|
||||
"title": "Database Connectivity Issues",
|
||||
"description": "Multiple database connection failures",
|
||||
"currentAlertState": {
|
||||
"_id": "state-id",
|
||||
"name": "Active",
|
||||
"color": "#FF0000"
|
||||
},
|
||||
"alertSeverity": {
|
||||
"_id": "severity-id",
|
||||
"name": "Critical",
|
||||
"color": "#FF0000"
|
||||
},
|
||||
"alertCount": 15,
|
||||
"uniqueMonitorCount": 3,
|
||||
"startedAt": "2026-01-20T10:45:00Z",
|
||||
"lastActivityAt": "2026-01-20T10:57:00Z",
|
||||
"groupingRule": {
|
||||
"_id": "rule-id",
|
||||
"name": "Database alerts - 5min"
|
||||
}
|
||||
}
|
||||
],
|
||||
"count": 55,
|
||||
"skip": 0,
|
||||
"limit": 10
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Get Episode Details
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-episode/{episodeId}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"_id": "episode-id-1",
|
||||
"episodeNumber": 42,
|
||||
"title": "Database Connectivity Issues",
|
||||
"description": "Multiple database connection failures",
|
||||
"currentAlertState": {
|
||||
"_id": "state-id",
|
||||
"name": "Active",
|
||||
"color": "#FF0000"
|
||||
},
|
||||
"alertSeverity": {
|
||||
"_id": "severity-id",
|
||||
"name": "Critical",
|
||||
"color": "#FF0000"
|
||||
},
|
||||
"alertCount": 15,
|
||||
"uniqueMonitorCount": 3,
|
||||
"startedAt": "2026-01-20T10:45:00Z",
|
||||
"lastActivityAt": "2026-01-20T10:57:00Z",
|
||||
"acknowledgedAt": null,
|
||||
"resolvedAt": null,
|
||||
"groupingRule": {
|
||||
"_id": "rule-id",
|
||||
"name": "Database alerts - 5min"
|
||||
},
|
||||
"ownerUsers": [],
|
||||
"ownerTeams": [],
|
||||
"labels": [],
|
||||
"rootCause": null
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Create Episode (Manual)
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-episode
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"title": "Custom Episode Title",
|
||||
"description": "Optional description"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:** Created episode object
|
||||
|
||||
---
|
||||
|
||||
### Update Episode
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-episode/{episodeId}
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"title": "Updated Title",
|
||||
"description": "Updated description",
|
||||
"ownerUsers": ["user-id-1"],
|
||||
"ownerTeams": ["team-id-1"],
|
||||
"labels": ["label-id-1"],
|
||||
"rootCause": "Database connection pool exhausted"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Delete Episode
|
||||
|
||||
```http
|
||||
DELETE /api/project/{projectId}/alert-episode/{episodeId}
|
||||
```
|
||||
|
||||
Deleting an episode removes all member relationships but does NOT delete the alerts themselves. Alerts will have their `episodeId` set to null.
|
||||
|
||||
---
|
||||
|
||||
### Acknowledge Episode
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-episode/{episodeId}/acknowledge
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"acknowledgeAlerts": true // Optional: also acknowledge all alerts
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"_id": "episode-id",
|
||||
"currentAlertState": {
|
||||
"_id": "acknowledged-state-id",
|
||||
"name": "Acknowledged"
|
||||
},
|
||||
"acknowledgedAt": "2026-01-20T11:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Resolve Episode
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-episode/{episodeId}/resolve
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"rootCause": "Database server restarted",
|
||||
"resolveAlerts": true // Optional: also resolve all alerts
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Get Episode Alerts
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-episode/{episodeId}/alerts
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
| `sort` | string | Sort field |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "alert-id-1",
|
||||
"alertNumber": 127,
|
||||
"title": "MySQL connection pool exhausted",
|
||||
"currentAlertState": { ... },
|
||||
"alertSeverity": { ... },
|
||||
"monitor": { ... },
|
||||
"createdAt": "2026-01-20T10:57:00Z",
|
||||
"episodeMembership": {
|
||||
"addedBy": "rule",
|
||||
"addedAt": "2026-01-20T10:57:00Z",
|
||||
"groupingRule": { "_id": "rule-id", "name": "Database alerts" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"count": 15,
|
||||
"skip": 0,
|
||||
"limit": 10
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Add Alert to Episode
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-episode/{episodeId}/add-alert
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertId": "alert-id-to-add"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Remove Alert from Episode
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-episode/{episodeId}/remove-alert
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertId": "alert-id-to-remove"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Merge Episodes
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-episode/merge
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"targetEpisodeId": "episode-to-keep",
|
||||
"sourceEpisodeIds": ["episode-to-merge-1", "episode-to-merge-2"]
|
||||
}
|
||||
```
|
||||
|
||||
All alerts from source episodes are moved to the target episode. Source episodes are deleted.
|
||||
|
||||
---
|
||||
|
||||
### Split Episode
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-episode/{episodeId}/split
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertIds": ["alert-id-1", "alert-id-2"],
|
||||
"newEpisodeTitle": "Split Episode"
|
||||
}
|
||||
```
|
||||
|
||||
Creates a new episode containing the specified alerts; those alerts are removed from the original episode.
|
||||
|
||||
---
|
||||
|
||||
### Get Episode Timeline
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-episode/{episodeId}/timeline
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"type": "alert_added",
|
||||
"timestamp": "2026-01-20T10:57:00Z",
|
||||
"description": "Alert #127 added to episode",
|
||||
"alert": { "_id": "alert-id", "title": "MySQL connection pool exhausted" },
|
||||
"addedBy": "rule"
|
||||
},
|
||||
{
|
||||
"type": "state_change",
|
||||
"timestamp": "2026-01-20T10:50:00Z",
|
||||
"description": "Assigned to John Smith",
|
||||
"user": { "_id": "user-id", "name": "John Smith" }
|
||||
},
|
||||
{
|
||||
"type": "episode_created",
|
||||
"timestamp": "2026-01-20T10:45:00Z",
|
||||
"description": "Episode created with 3 initial alerts",
|
||||
"groupingRule": { "_id": "rule-id", "name": "Database alerts - 5min" }
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Grouping Rules API
|
||||
|
||||
### List Grouping Rules
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-grouping-rule
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "rule-id-1",
|
||||
"name": "Database Alerts - 5 minute window",
|
||||
"description": "Groups database-related alerts within 5 minutes",
|
||||
"isEnabled": true,
|
||||
"priority": 1,
|
||||
"matchCriteria": {
|
||||
"labelIds": ["database-label-id"],
|
||||
"titlePattern": ".*(connection|database|mysql|postgres).*"
|
||||
},
|
||||
"groupingConfig": {
|
||||
"type": "time_window",
|
||||
"timeWindowMinutes": 5
|
||||
},
|
||||
"episodeConfig": {
|
||||
"titleTemplate": "{{severity}} - Database Issues",
|
||||
"autoResolveWhenEmpty": true,
|
||||
"breakAfterMinutesInactive": 60
|
||||
}
|
||||
}
|
||||
],
|
||||
"count": 3
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Get Grouping Rule
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-grouping-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Create Grouping Rule
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-grouping-rule
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Database Alerts - 5 minute window",
|
||||
"description": "Groups database-related alerts within 5 minutes",
|
||||
"isEnabled": true,
|
||||
"priority": 1,
|
||||
"matchCriteria": {
|
||||
"severityIds": ["critical-id", "high-id"],
|
||||
"labelIds": ["database-label-id"],
|
||||
"titlePattern": ".*(connection|database).*"
|
||||
},
|
||||
"groupingConfig": {
|
||||
"type": "time_window",
|
||||
"timeWindowMinutes": 5
|
||||
},
|
||||
"episodeConfig": {
|
||||
"titleTemplate": "{{severity}} - Database Issues",
|
||||
"autoResolveWhenEmpty": true,
|
||||
"breakAfterMinutesInactive": 60
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Update Grouping Rule
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-grouping-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Delete Grouping Rule
|
||||
|
||||
```http
|
||||
DELETE /api/project/{projectId}/alert-grouping-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Enable/Disable Grouping Rule
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-grouping-rule/{ruleId}/enable
|
||||
POST /api/project/{projectId}/alert-grouping-rule/{ruleId}/disable
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Test Grouping Rule
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-grouping-rule/{ruleId}/test
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertIds": ["alert-id-1", "alert-id-2", "alert-id-3"]
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"matchedAlerts": [
|
||||
{ "_id": "alert-id-1", "title": "MySQL timeout", "wouldMatch": true },
|
||||
{ "_id": "alert-id-2", "title": "API error", "wouldMatch": false },
|
||||
{ "_id": "alert-id-3", "title": "PostgreSQL error", "wouldMatch": true }
|
||||
],
|
||||
"wouldCreateEpisodes": 1,
|
||||
"groupingPreview": [
|
||||
{
|
||||
"episodeTitle": "Critical - Database Issues",
|
||||
"alerts": ["alert-id-1", "alert-id-3"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Existing Alert API Changes
|
||||
|
||||
### Alert Response Enhancement
|
||||
|
||||
The existing Alert response will include episode information:
|
||||
|
||||
```json
|
||||
{
|
||||
"_id": "alert-id",
|
||||
"alertNumber": 127,
|
||||
"title": "MySQL connection pool exhausted",
|
||||
"episode": {
|
||||
"_id": "episode-id",
|
||||
"episodeNumber": 42,
|
||||
"title": "Database Connectivity Issues"
|
||||
},
|
||||
"fingerprint": "abc123...",
|
||||
"duplicateCount": 5
|
||||
}
|
||||
```
|
||||
|
||||
### Filter Alerts by Episode
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert?episodeId={episodeId}
|
||||
```
|
||||
|
||||
### Get Ungrouped Alerts
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert?episodeId=null
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Implementation Notes
|
||||
|
||||
### Permissions
|
||||
|
||||
| Endpoint | Required Permission |
|
||||
|----------|---------------------|
|
||||
| GET episodes | `ProjectMember` |
|
||||
| Create/Update/Delete episodes | `ProjectAdmin` |
|
||||
| Acknowledge/Resolve episodes | `ProjectMember` |
|
||||
| GET grouping rules | `ProjectMember` |
|
||||
| Create/Update/Delete grouping rules | `ProjectAdmin` |
|
||||
|
||||
### Error Responses
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "EPISODE_NOT_FOUND",
|
||||
"message": "Episode with ID xxx not found"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Common error codes:
|
||||
- `EPISODE_NOT_FOUND` - Episode doesn't exist
|
||||
- `ALERT_NOT_FOUND` - Alert doesn't exist
|
||||
- `ALERT_ALREADY_IN_EPISODE` - Alert is already part of an episode
|
||||
- `CANNOT_MERGE_RESOLVED` - Cannot merge resolved episodes
|
||||
- `INVALID_GROUPING_CONFIG` - Invalid grouping rule configuration
|
||||
|
||||
### Rate Limiting
|
||||
|
||||
Standard API rate limits apply. Batch operations (merge, bulk add) count as multiple operations.
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Episode API
|
||||
- [ ] GET /alert-episode (list)
|
||||
- [ ] GET /alert-episode/:id (details)
|
||||
- [ ] POST /alert-episode (create)
|
||||
- [ ] PUT /alert-episode/:id (update)
|
||||
- [ ] DELETE /alert-episode/:id (delete)
|
||||
- [ ] POST /alert-episode/:id/acknowledge
|
||||
- [ ] POST /alert-episode/:id/resolve
|
||||
- [ ] GET /alert-episode/:id/alerts
|
||||
- [ ] POST /alert-episode/:id/add-alert
|
||||
- [ ] POST /alert-episode/:id/remove-alert
|
||||
- [ ] POST /alert-episode/merge
|
||||
- [ ] POST /alert-episode/:id/split
|
||||
- [ ] GET /alert-episode/:id/timeline
|
||||
|
||||
### Grouping Rule API
|
||||
- [ ] GET /alert-grouping-rule (list)
|
||||
- [ ] GET /alert-grouping-rule/:id (details)
|
||||
- [ ] POST /alert-grouping-rule (create)
|
||||
- [ ] PUT /alert-grouping-rule/:id (update)
|
||||
- [ ] DELETE /alert-grouping-rule/:id (delete)
|
||||
- [ ] POST /alert-grouping-rule/:id/enable
|
||||
- [ ] POST /alert-grouping-rule/:id/disable
|
||||
- [ ] POST /alert-grouping-rule/:id/test
|
||||
|
||||
### Alert API Updates
|
||||
- [ ] Add episode field to alert response
|
||||
- [ ] Add episodeId filter to alert list
|
||||
- [ ] Add fingerprint field to alert response
|
||||
669
Docs/Plan/AlertGrouping/4-UI.md
Normal file
669
Docs/Plan/AlertGrouping/4-UI.md
Normal file
@@ -0,0 +1,669 @@
|
||||
# UI Implementation for Alert Grouping
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the frontend components and pages required for Alert Grouping / Episodes functionality.
|
||||
|
||||
## Navigation Structure
|
||||
|
||||
```
|
||||
Dashboard
|
||||
├── Alerts
|
||||
│ ├── All Alerts (existing)
|
||||
│ └── Episodes (NEW)
|
||||
└── Settings
|
||||
├── Alerts
|
||||
│ ├── Alert States (existing)
|
||||
│ ├── Alert Severities (existing)
|
||||
│ └── Grouping Rules (NEW)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages to Create
|
||||
|
||||
### 1. Episodes List Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/Episodes.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/episodes`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts > Episodes [+ Create Episode] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌────────┬──────────────┬────────────┬───────┐ ┌─────────────────────────────┐ │
|
||||
│ │ Active │ Acknowledged │ Resolved │ All │ │ 🔍 Search episodes... │ │
|
||||
│ │ (5) │ (2) │ (48) │ (55) │ └─────────────────────────────┘ │
|
||||
│ └────────┴──────────────┴────────────┴───────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ● EP-42 Database Connectivity Issues 🔴 Critical │ │
|
||||
│ │ ┌─────────────────────────────────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 15 alerts │ 3 monitors │ Started 10 min ago │ Last activity: 2 min ago │ │ │
|
||||
│ │ └─────────────────────────────────────────────────────────────────────────┘ │ │
|
||||
│ │ │ │
|
||||
│ │ Preview: │ │
|
||||
│ │ • Alert #123: MySQL connection timeout on web-server-1 │ │
|
||||
│ │ • Alert #124: MySQL connection timeout on web-server-2 │ │
|
||||
│ │ • Alert #125: PostgreSQL connection refused on api-server │ │
|
||||
│ │ └── +12 more alerts │ │
|
||||
│ │ │ │
|
||||
│ │ Rule: "Group database alerts within 5 min" [Acknowledge] [Resolve] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ● EP-41 High CPU Utilization 🟠 High │ │
|
||||
│ │ ... │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [1] [2] [3] ... [Next →] │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Implementation:**
|
||||
|
||||
```typescript
|
||||
// /Dashboard/src/Pages/Alerts/Episodes.tsx
|
||||
|
||||
import React, { FunctionComponent, ReactElement } from 'react';
|
||||
import PageComponentProps from '../PageComponentProps';
|
||||
import ModelTable from 'Common/UI/Components/ModelTable/ModelTable';
|
||||
import AlertEpisode from 'Common/Models/DatabaseModels/AlertEpisode';
|
||||
import FieldType from 'Common/UI/Components/Types/FieldType';
|
||||
import Navigation from 'Common/UI/Utils/Navigation';
|
||||
import DashboardNavigation from '../../Utils/Navigation';
|
||||
import AlertSeverity from 'Common/Models/DatabaseModels/AlertSeverity';
|
||||
import AlertState from 'Common/Models/DatabaseModels/AlertState';
|
||||
import Pill from 'Common/UI/Components/Pill/Pill';
|
||||
import { Black } from 'Common/Types/BrandColors';
|
||||
|
||||
const EpisodesPage: FunctionComponent<PageComponentProps> = (
|
||||
props: PageComponentProps
|
||||
): ReactElement => {
|
||||
return (
|
||||
<ModelTable<AlertEpisode>
|
||||
modelType={AlertEpisode}
|
||||
id="episodes-table"
|
||||
isDeleteable={true}
|
||||
isEditable={false}
|
||||
isCreateable={true}
|
||||
isViewable={true}
|
||||
name="Episodes"
|
||||
query={{
|
||||
projectId: DashboardNavigation.getProjectId()!,
|
||||
}}
|
||||
cardProps={{
|
||||
title: 'Episodes',
|
||||
description:
|
||||
'Episodes group related alerts together for easier management.',
|
||||
}}
|
||||
selectMoreFields={{
|
||||
alertCount: true,
|
||||
uniqueMonitorCount: true,
|
||||
startedAt: true,
|
||||
lastActivityAt: true,
|
||||
}}
|
||||
columns={[
|
||||
{
|
||||
field: {
|
||||
episodeNumber: true,
|
||||
},
|
||||
title: 'Episode',
|
||||
type: FieldType.Text,
|
||||
getElement: (item: AlertEpisode): ReactElement => {
|
||||
return (
|
||||
<span className="font-medium">
|
||||
EP-{item.episodeNumber}
|
||||
</span>
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
field: {
|
||||
title: true,
|
||||
},
|
||||
title: 'Title',
|
||||
type: FieldType.Text,
|
||||
},
|
||||
{
|
||||
field: {
|
||||
currentAlertState: {
|
||||
name: true,
|
||||
color: true,
|
||||
},
|
||||
},
|
||||
title: 'State',
|
||||
type: FieldType.Entity,
|
||||
getElement: (item: AlertEpisode): ReactElement => {
|
||||
if (!item.currentAlertState) {
|
||||
return <></>;
|
||||
}
|
||||
return (
|
||||
<Pill
|
||||
text={item.currentAlertState.name || ''}
|
||||
color={item.currentAlertState.color || Black}
|
||||
/>
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
field: {
|
||||
alertSeverity: {
|
||||
name: true,
|
||||
color: true,
|
||||
},
|
||||
},
|
||||
title: 'Severity',
|
||||
type: FieldType.Entity,
|
||||
getElement: (item: AlertEpisode): ReactElement => {
|
||||
if (!item.alertSeverity) {
|
||||
return <></>;
|
||||
}
|
||||
return (
|
||||
<Pill
|
||||
text={item.alertSeverity.name || ''}
|
||||
color={item.alertSeverity.color || Black}
|
||||
/>
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
field: {
|
||||
alertCount: true,
|
||||
},
|
||||
title: 'Alerts',
|
||||
type: FieldType.Number,
|
||||
},
|
||||
{
|
||||
field: {
|
||||
lastActivityAt: true,
|
||||
},
|
||||
title: 'Last Activity',
|
||||
type: FieldType.DateTime,
|
||||
},
|
||||
]}
|
||||
filters={[
|
||||
{
|
||||
field: {
|
||||
currentAlertState: {
|
||||
_id: true,
|
||||
},
|
||||
},
|
||||
title: 'State',
|
||||
type: FieldType.Entity,
|
||||
filterEntityType: AlertState,
|
||||
filterQuery: {
|
||||
projectId: DashboardNavigation.getProjectId()!,
|
||||
},
|
||||
},
|
||||
{
|
||||
field: {
|
||||
alertSeverity: {
|
||||
_id: true,
|
||||
},
|
||||
},
|
||||
title: 'Severity',
|
||||
type: FieldType.Entity,
|
||||
filterEntityType: AlertSeverity,
|
||||
filterQuery: {
|
||||
projectId: DashboardNavigation.getProjectId()!,
|
||||
},
|
||||
},
|
||||
]}
|
||||
onViewPage={(item: AlertEpisode): void => {
|
||||
Navigation.navigate(
|
||||
DashboardNavigation.getAlertEpisodeViewRoute(item._id!)
|
||||
);
|
||||
}}
|
||||
/>
|
||||
);
|
||||
};
|
||||
|
||||
export default EpisodesPage;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Episode Detail Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/EpisodeView/Index.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/episodes/:episodeId`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ← Episodes EP-42: Database Connectivity Issues │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────┐ ┌──────────────────────────────┐ │
|
||||
│ │ Status │ 🔴 Active │ │ Actions │ │
|
||||
│ │ Severity │ Critical │ │ ┌────────────────────────┐ │ │
|
||||
│ │ Started │ Jan 20, 2026 10:45 AM │ │ │ [Acknowledge] │ │ │
|
||||
│ │ Last Activity │ 2 min ago │ │ │ [Resolve] │ │ │
|
||||
│ │ Alert Count │ 15 │ │ │ [Add Alert] │ │ │
|
||||
│ │ Monitors │ 3 │ │ │ [Merge Episodes] │ │ │
|
||||
│ └──────────────────────────────────────────────┘ └──────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Tabs: [Overview] [Alerts (15)] [Timeline] [Settings] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ OVERVIEW TAB: │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Description [Edit] │ │
|
||||
│ │ Multiple database connection failures affecting production services │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Assigned To [Edit] │ │
|
||||
│ │ 👤 John Smith (DBA Team) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Root Cause Analysis [Edit] │ │
|
||||
│ │ Database connection pool exhausted due to connection leak in payment service │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Grouping Rule │ │
|
||||
│ │ "Database alerts - 5min" (Time Window: 5 minutes) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Sub-pages:**
|
||||
|
||||
| Route | Component | Description |
|
||||
|-------|-----------|-------------|
|
||||
| `/episodes/:id` | Overview | Episode details, owners, root cause |
|
||||
| `/episodes/:id/alerts` | Alerts | List of alerts in episode |
|
||||
| `/episodes/:id/timeline` | Timeline | Episode activity timeline |
|
||||
| `/episodes/:id/settings` | Settings | Delete episode |
|
||||
|
||||
---
|
||||
|
||||
### 3. Episode Alerts Tab
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/EpisodeView/Alerts.tsx`
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ALERTS TAB: [+ Add Alert] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌───────┬──────────────────────────────────────────────┬──────────┬───────┬──────┐ │
|
||||
│ │ ID │ Title │ Monitor │ State │ ··· │ │
|
||||
│ ├───────┼──────────────────────────────────────────────┼──────────┼───────┼──────┤ │
|
||||
│ │ #127 │ MySQL connection pool exhausted │ mysql-01 │ ● Act │ [x] │ │
|
||||
│ │ #126 │ MySQL connection timeout │ web-02 │ ● Act │ [x] │ │
|
||||
│ │ #125 │ PostgreSQL connection refused │ api-01 │ ✓ Res │ [x] │ │
|
||||
│ │ #124 │ MySQL connection timeout │ web-02 │ ● Act │ [x] │ │
|
||||
│ │ #123 │ MySQL connection timeout │ web-01 │ ● Act │ [x] │ │
|
||||
│ └───────┴──────────────────────────────────────────────┴──────────┴───────┴──────┘ │
|
||||
│ │
|
||||
│ Note: [x] = Remove from episode button │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Grouping Rules Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertGroupingRules.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-grouping-rules`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Alert Grouping Rules [+ Create Rule] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Grouping rules automatically combine related alerts into Episodes. │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ✅ Database Alerts - 5 minute window Priority: 1 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ Type: Time Window (5 minutes) │ │
|
||||
│ │ Matches: Monitors with label "database" │ │
|
||||
│ │ Episodes created: 23 │ Alerts grouped: 156 │ │
|
||||
│ │ [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ❌ Smart Grouping (Disabled) Priority: 2 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ Type: Smart (80% similarity) │ │
|
||||
│ │ Matches: All critical alerts │ │
|
||||
│ │ [Enable] [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. Create/Edit Grouping Rule Form
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertGroupingRuleView/Index.tsx`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Create Grouping Rule │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ BASIC INFORMATION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Rule Name * │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Database Alerts - 5 minute window │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Description │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Groups database-related alerts within 5 minutes │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Priority (lower = evaluated first) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 1 │ │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ MATCHING CRITERIA │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ Which alerts should this rule apply to? │
|
||||
│ │
|
||||
│ Severities (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [Critical ×] [High ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Labels (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [database ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Title Pattern (regex, optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ .*(connection|database|mysql|postgres).* │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ GROUPING METHOD │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Grouping Type * │
|
||||
│ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │
|
||||
│ │ ● Time Window │ │ ○ Field-Based │ │ ○ Smart (Beta) │ │
|
||||
│ └──────────────────┘ └──────────────────┘ └──────────────────┘ │
|
||||
│ │
|
||||
│ Time Window (minutes) * │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 5 │ Alerts arriving within this window will be grouped together. │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ EPISODE SETTINGS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Episode Title Template │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ {{severity}} - Database Issues │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ Available: {{severity}}, {{monitor}}, {{alertCount}} │
|
||||
│ │
|
||||
│ ☑ Auto-resolve episode when all alerts are resolved │
|
||||
│ │
|
||||
│ Break episode after inactive for (minutes) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 60 │ │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ [Cancel] [Test Rule] [Save] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Existing Page Modifications
|
||||
|
||||
### 1. Alerts Table Enhancement
|
||||
|
||||
Add an Episode column to the existing Alerts table.
|
||||
|
||||
**File:** `/Dashboard/src/Pages/Alerts/View/Index.tsx`
|
||||
|
||||
```typescript
|
||||
// Add to columns array:
|
||||
{
|
||||
field: {
|
||||
episode: {
|
||||
_id: true,
|
||||
episodeNumber: true,
|
||||
title: true,
|
||||
},
|
||||
},
|
||||
title: 'Episode',
|
||||
type: FieldType.Entity,
|
||||
getElement: (item: Alert): ReactElement => {
|
||||
if (!item.episode) {
|
||||
return <span className="text-gray-400">—</span>;
|
||||
}
|
||||
return (
|
||||
<Link
|
||||
to={DashboardNavigation.getAlertEpisodeViewRoute(
|
||||
item.episode._id!
|
||||
)}
|
||||
>
|
||||
EP-{item.episode.episodeNumber}
|
||||
</Link>
|
||||
);
|
||||
},
|
||||
},
|
||||
```
|
||||
|
||||
### 2. Alert Detail Page Enhancement
|
||||
|
||||
Show episode membership on the alert detail page.
|
||||
|
||||
**File:** `/Dashboard/src/Pages/Alerts/AlertView/Index.tsx`
|
||||
|
||||
Add a card showing:
|
||||
- Episode badge (if part of episode)
|
||||
- Link to episode detail
|
||||
- Button to remove from episode
|
||||
|
||||
---
|
||||
|
||||
## Components to Create
|
||||
|
||||
### 1. EpisodeCard Component
|
||||
|
||||
**File:** `/Dashboard/src/Components/Episode/EpisodeCard.tsx`
|
||||
|
||||
Reusable card for displaying episode summary.
|
||||
|
||||
```typescript
|
||||
interface EpisodeCardProps {
|
||||
episode: AlertEpisode;
|
||||
showAlertPreview?: boolean;
|
||||
onAcknowledge?: () => void;
|
||||
onResolve?: () => void;
|
||||
}
|
||||
```
|
||||
|
||||
### 2. EpisodeBadge Component
|
||||
|
||||
**File:** `/Dashboard/src/Components/Episode/EpisodeBadge.tsx`
|
||||
|
||||
Small badge showing the episode number and linking to the episode.
|
||||
|
||||
```typescript
|
||||
interface EpisodeBadgeProps {
|
||||
episodeNumber: number;
|
||||
episodeId: ObjectID;
|
||||
}
|
||||
```
|
||||
|
||||
### 3. AddAlertToEpisodeModal Component
|
||||
|
||||
**File:** `/Dashboard/src/Components/Episode/AddAlertToEpisodeModal.tsx`
|
||||
|
||||
Modal for manually adding alerts to an episode.
|
||||
|
||||
### 4. MergeEpisodesModal Component
|
||||
|
||||
**File:** `/Dashboard/src/Components/Episode/MergeEpisodesModal.tsx`
|
||||
|
||||
Modal for merging multiple episodes.
|
||||
|
||||
### 5. GroupingRuleForm Component
|
||||
|
||||
**File:** `/Dashboard/src/Components/GroupingRule/GroupingRuleForm.tsx`
|
||||
|
||||
Form for creating/editing grouping rules with:
|
||||
- Match criteria builder
|
||||
- Grouping type selector
|
||||
- Episode config options
|
||||
|
||||
---
|
||||
|
||||
## Routing Configuration
|
||||
|
||||
Add to `/Dashboard/src/Routes/AlertRoutes.tsx`:
|
||||
|
||||
```typescript
|
||||
// Episode routes
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/episodes',
|
||||
component: EpisodesPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/episodes/:episodeId',
|
||||
component: EpisodeViewLayout,
|
||||
children: [
|
||||
{
|
||||
path: '',
|
||||
component: EpisodeOverview,
|
||||
},
|
||||
{
|
||||
path: 'alerts',
|
||||
component: EpisodeAlerts,
|
||||
},
|
||||
{
|
||||
path: 'timeline',
|
||||
component: EpisodeTimeline,
|
||||
},
|
||||
{
|
||||
path: 'settings',
|
||||
component: EpisodeSettings,
|
||||
},
|
||||
],
|
||||
},
|
||||
```
|
||||
|
||||
Add to `/Dashboard/src/Routes/SettingsRoutes.tsx`:
|
||||
|
||||
```typescript
|
||||
// Grouping rule routes
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-grouping-rules',
|
||||
component: AlertGroupingRulesPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-grouping-rules/:ruleId',
|
||||
component: AlertGroupingRuleViewLayout,
|
||||
},
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Navigation Helper Updates
|
||||
|
||||
Add to `/Dashboard/src/Utils/Navigation.ts`:
|
||||
|
||||
```typescript
|
||||
public static getAlertEpisodesRoute(projectId?: ObjectID): Route {
|
||||
return new Route(`/dashboard/${projectId?.toString()}/alerts/episodes`);
|
||||
}
|
||||
|
||||
public static getAlertEpisodeViewRoute(episodeId: ObjectID): Route {
|
||||
return new Route(
|
||||
`/dashboard/${this.getProjectId()?.toString()}/alerts/episodes/${episodeId.toString()}`
|
||||
);
|
||||
}
|
||||
|
||||
public static getAlertGroupingRulesRoute(): Route {
|
||||
return new Route(
|
||||
`/dashboard/${this.getProjectId()?.toString()}/settings/alert-grouping-rules`
|
||||
);
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sidebar Menu Updates
|
||||
|
||||
Add to Alerts section in `/Dashboard/src/Components/Sidebar/Sidebar.tsx`:
|
||||
|
||||
```typescript
|
||||
{
|
||||
title: 'Episodes',
|
||||
route: RouteMap.AlertEpisodes,
|
||||
icon: IconProp.Layers,
|
||||
}
|
||||
```
|
||||
|
||||
Add to Settings > Alerts section:
|
||||
|
||||
```typescript
|
||||
{
|
||||
title: 'Grouping Rules',
|
||||
route: RouteMap.AlertGroupingRules,
|
||||
icon: IconProp.Layers,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pages
|
||||
- [ ] Episodes list page
|
||||
- [ ] Episode detail page (overview)
|
||||
- [ ] Episode alerts tab
|
||||
- [ ] Episode timeline tab
|
||||
- [ ] Episode settings tab
|
||||
- [ ] Grouping rules list page
|
||||
- [ ] Grouping rule detail/edit page
|
||||
|
||||
### Components
|
||||
- [ ] EpisodeCard component
|
||||
- [ ] EpisodeBadge component
|
||||
- [ ] AddAlertToEpisodeModal
|
||||
- [ ] MergeEpisodesModal
|
||||
- [ ] GroupingRuleForm
|
||||
- [ ] GroupingTypeSelector (the grouping type selector used inside GroupingRuleForm)
|
||||
|
||||
### Existing Page Updates
|
||||
- [ ] Add Episode column to Alerts table
|
||||
- [ ] Add Episode card to Alert detail page
|
||||
- [ ] Add sidebar navigation items
|
||||
- [ ] Update route configuration
|
||||
|
||||
### Styling
|
||||
- [ ] Episode card styles
|
||||
- [ ] Episode badge styles
|
||||
- [ ] Grouping rule form styles
|
||||
- [ ] Timeline component styles
|
||||
888
Docs/Plan/AlertGrouping/5-Migration.md
Normal file
888
Docs/Plan/AlertGrouping/5-Migration.md
Normal file
@@ -0,0 +1,888 @@
|
||||
# Migration & Rollout Plan for Alert Grouping
|
||||
|
||||
## Overview
|
||||
|
||||
This document outlines the database migrations, feature flags, and rollout strategy for Alert Grouping / Episodes functionality.
|
||||
|
||||
## Database Migrations
|
||||
|
||||
### Migration 1: Create AlertGroupingRule Table
|
||||
|
||||
**File:** `/Common/Server/Infrastructure/Postgres/SchemaMigrations/XXXX-CreateAlertGroupingRule.ts`
|
||||
|
||||
```typescript
|
||||
import { MigrationInterface, QueryRunner, Table, TableIndex } from 'typeorm';
|
||||
|
||||
export class CreateAlertGroupingRule implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertGroupingRule',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'name',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'description',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'isEnabled',
|
||||
type: 'boolean',
|
||||
default: true,
|
||||
},
|
||||
{
|
||||
name: 'matchCriteria',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'groupingConfig',
|
||||
type: 'jsonb',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'episodeConfig',
|
||||
type: 'jsonb',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'priority',
|
||||
type: 'integer',
|
||||
default: 100,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertGroupingRule',
|
||||
new TableIndex({
|
||||
name: 'idx_grouping_rule_project_enabled',
|
||||
columnNames: ['projectId', 'isEnabled', 'priority'],
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertGroupingRule');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Migration 2: Create AlertEpisode Table
|
||||
|
||||
**File:** `/Common/Server/Infrastructure/Postgres/SchemaMigrations/XXXX-CreateAlertEpisode.ts`
|
||||
|
||||
```typescript
|
||||
import { MigrationInterface, QueryRunner, Table, TableIndex, TableForeignKey } from 'typeorm';
|
||||
|
||||
export class CreateAlertEpisode implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertEpisode',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'episodeNumber',
|
||||
type: 'integer',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'title',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'description',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'groupingRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'currentAlertStateId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'alertSeverityId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'startedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'lastActivityAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'acknowledgedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'resolvedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'alertCount',
|
||||
type: 'integer',
|
||||
default: 0,
|
||||
},
|
||||
{
|
||||
name: 'uniqueMonitorCount',
|
||||
type: 'integer',
|
||||
default: 0,
|
||||
},
|
||||
{
|
||||
name: 'rootCause',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'AlertEpisode',
|
||||
new TableIndex({
|
||||
name: 'idx_episode_project_state',
|
||||
columnNames: ['projectId', 'currentAlertStateId', 'lastActivityAt'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertEpisode',
|
||||
new TableIndex({
|
||||
name: 'idx_episode_grouping_rule',
|
||||
columnNames: ['projectId', 'groupingRuleId', 'currentAlertStateId'],
|
||||
})
|
||||
);
|
||||
|
||||
// Foreign keys
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisode',
|
||||
new TableForeignKey({
|
||||
columnNames: ['projectId'],
|
||||
referencedTableName: 'Project',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisode',
|
||||
new TableForeignKey({
|
||||
columnNames: ['groupingRuleId'],
|
||||
referencedTableName: 'AlertGroupingRule',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'SET NULL',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisode',
|
||||
new TableForeignKey({
|
||||
columnNames: ['currentAlertStateId'],
|
||||
referencedTableName: 'AlertState',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'SET NULL',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisode',
|
||||
new TableForeignKey({
|
||||
columnNames: ['alertSeverityId'],
|
||||
referencedTableName: 'AlertSeverity',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'SET NULL',
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertEpisode');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Migration 3: Create AlertEpisodeMember Table
|
||||
|
||||
**File:** `/Common/Server/Infrastructure/Postgres/SchemaMigrations/XXXX-CreateAlertEpisodeMember.ts`
|
||||
|
||||
```typescript
|
||||
import { MigrationInterface, QueryRunner, Table, TableIndex, TableForeignKey } from 'typeorm';
|
||||
|
||||
export class CreateAlertEpisodeMember implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertEpisodeMember',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'episodeId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'alertId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'addedBy',
|
||||
type: 'varchar',
|
||||
length: '50',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'addedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'groupingRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'similarityScore',
|
||||
type: 'decimal',
|
||||
precision: 5,
|
||||
scale: 4,
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'AlertEpisodeMember',
|
||||
new TableIndex({
|
||||
name: 'idx_episode_member_episode',
|
||||
columnNames: ['episodeId', 'addedAt'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertEpisodeMember',
|
||||
new TableIndex({
|
||||
name: 'idx_episode_member_alert',
|
||||
columnNames: ['alertId'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertEpisodeMember',
|
||||
new TableIndex({
|
||||
name: 'idx_episode_member_unique',
|
||||
columnNames: ['episodeId', 'alertId'],
|
||||
isUnique: true,
|
||||
})
|
||||
);
|
||||
|
||||
// Foreign keys
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeMember',
|
||||
new TableForeignKey({
|
||||
columnNames: ['projectId'],
|
||||
referencedTableName: 'Project',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeMember',
|
||||
new TableForeignKey({
|
||||
columnNames: ['episodeId'],
|
||||
referencedTableName: 'AlertEpisode',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeMember',
|
||||
new TableForeignKey({
|
||||
columnNames: ['alertId'],
|
||||
referencedTableName: 'Alert',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertEpisodeMember');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Migration 4: Add Episode Fields to Alert Table
|
||||
|
||||
**File:** `/Common/Server/Infrastructure/Postgres/SchemaMigrations/XXXX-AddEpisodeFieldsToAlert.ts`
|
||||
|
||||
```typescript
|
||||
import { MigrationInterface, QueryRunner, TableColumn, TableIndex, TableForeignKey } from 'typeorm';
|
||||
|
||||
export class AddEpisodeFieldsToAlert implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
// Add episodeId column
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'episodeId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
})
|
||||
);
|
||||
|
||||
// Add fingerprint column
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'fingerprint',
|
||||
type: 'varchar',
|
||||
length: '64',
|
||||
isNullable: true,
|
||||
})
|
||||
);
|
||||
|
||||
// Add duplicateCount column
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'duplicateCount',
|
||||
type: 'integer',
|
||||
default: 0,
|
||||
})
|
||||
);
|
||||
|
||||
// Add lastDuplicateAt column
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'lastDuplicateAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
})
|
||||
);
|
||||
|
||||
// Create indexes
|
||||
await queryRunner.createIndex(
|
||||
'Alert',
|
||||
new TableIndex({
|
||||
name: 'idx_alert_episode',
|
||||
columnNames: ['episodeId'],
|
||||
where: '"episodeId" IS NOT NULL',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'Alert',
|
||||
new TableIndex({
|
||||
name: 'idx_alert_fingerprint',
|
||||
columnNames: ['projectId', 'fingerprint'],
|
||||
where: '"fingerprint" IS NOT NULL',
|
||||
})
|
||||
);
|
||||
|
||||
// Create foreign key
|
||||
await queryRunner.createForeignKey(
|
||||
'Alert',
|
||||
new TableForeignKey({
|
||||
name: 'fk_alert_episode',
|
||||
columnNames: ['episodeId'],
|
||||
referencedTableName: 'AlertEpisode',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'SET NULL',
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropForeignKey('Alert', 'fk_alert_episode');
|
||||
await queryRunner.dropIndex('Alert', 'idx_alert_fingerprint');
|
||||
await queryRunner.dropIndex('Alert', 'idx_alert_episode');
|
||||
await queryRunner.dropColumn('Alert', 'lastDuplicateAt');
|
||||
await queryRunner.dropColumn('Alert', 'duplicateCount');
|
||||
await queryRunner.dropColumn('Alert', 'fingerprint');
|
||||
await queryRunner.dropColumn('Alert', 'episodeId');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Migration 5: Create Episode Join Tables
|
||||
|
||||
**File:** `/Common/Server/Infrastructure/Postgres/SchemaMigrations/XXXX-CreateEpisodeJoinTables.ts`
|
||||
|
||||
```typescript
|
||||
import { MigrationInterface, QueryRunner, Table, TableForeignKey } from 'typeorm';
|
||||
|
||||
export class CreateEpisodeJoinTables implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
// AlertEpisodeOwnerUser join table
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertEpisodeOwnerUser',
|
||||
columns: [
|
||||
{
|
||||
name: 'episodeId',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
},
|
||||
{
|
||||
name: 'userId',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeOwnerUser',
|
||||
new TableForeignKey({
|
||||
columnNames: ['episodeId'],
|
||||
referencedTableName: 'AlertEpisode',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeOwnerUser',
|
||||
new TableForeignKey({
|
||||
columnNames: ['userId'],
|
||||
referencedTableName: 'User',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
// AlertEpisodeOwnerTeam join table
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertEpisodeOwnerTeam',
|
||||
columns: [
|
||||
{
|
||||
name: 'episodeId',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
},
|
||||
{
|
||||
name: 'teamId',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeOwnerTeam',
|
||||
new TableForeignKey({
|
||||
columnNames: ['episodeId'],
|
||||
referencedTableName: 'AlertEpisode',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeOwnerTeam',
|
||||
new TableForeignKey({
|
||||
columnNames: ['teamId'],
|
||||
referencedTableName: 'Team',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
// AlertEpisodeLabel join table
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertEpisodeLabel',
|
||||
columns: [
|
||||
{
|
||||
name: 'episodeId',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
},
|
||||
{
|
||||
name: 'labelId',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeLabel',
|
||||
new TableForeignKey({
|
||||
columnNames: ['episodeId'],
|
||||
referencedTableName: 'AlertEpisode',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertEpisodeLabel',
|
||||
new TableForeignKey({
|
||||
columnNames: ['labelId'],
|
||||
referencedTableName: 'Label',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertEpisodeLabel');
|
||||
await queryRunner.dropTable('AlertEpisodeOwnerTeam');
|
||||
await queryRunner.dropTable('AlertEpisodeOwnerUser');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Feature Flags
|
||||
|
||||
### Project-Level Settings
|
||||
|
||||
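Add these settings to the Project model (the implementation below reads the master switch from an `alertGroupingEnabled` field on Project) or create a separate AlertGroupingSettings model: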
|
||||
|
||||
```typescript
|
||||
interface AlertGroupingSettings {
|
||||
// Master switch
|
||||
groupingEnabled: boolean;
|
||||
|
||||
// Auto-create episodes for new alerts
|
||||
autoCreateEpisodes: boolean;
|
||||
|
||||
// Default time window for grouping (minutes)
|
||||
defaultTimeWindowMinutes: number;
|
||||
}
|
||||
```
|
||||
|
||||
### Implementation
|
||||
|
||||
```typescript
|
||||
// /Common/Server/Services/AlertGroupingSettingsService.ts
|
||||
|
||||
export default class AlertGroupingSettingsService {
|
||||
public static async isGroupingEnabled(projectId: ObjectID): Promise<boolean> {
|
||||
const settings = await ProjectService.findOneById({
|
||||
id: projectId,
|
||||
select: { alertGroupingEnabled: true },
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
return settings?.alertGroupingEnabled ?? false;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Usage in GroupingEngine
|
||||
|
||||
```typescript
|
||||
// In GroupingEngine.processAlert():
|
||||
const isEnabled = await AlertGroupingSettingsService.isGroupingEnabled(projectId);
|
||||
if (!isEnabled) {
|
||||
return { shouldGroup: false, isNewEpisode: false };
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollout Strategy
|
||||
|
||||
### Phase 1: Internal Alpha
|
||||
|
||||
**Duration:** 1 week
|
||||
|
||||
**Scope:**
|
||||
- Enable for internal test projects only
|
||||
- Feature flag: `ALERT_GROUPING_INTERNAL_ONLY=true`
|
||||
|
||||
**Validation:**
|
||||
- Verify migrations run successfully
|
||||
- Test basic grouping flow
|
||||
- Check performance metrics
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Beta (Opt-in)
|
||||
|
||||
**Duration:** 2 weeks
|
||||
|
||||
**Scope:**
|
||||
- Available to all projects but disabled by default
|
||||
- Users must explicitly enable in Settings
|
||||
- Show "Beta" badge on Episodes page
|
||||
|
||||
**Communication:**
|
||||
- In-app announcement
|
||||
- Documentation published
|
||||
- Support team briefed
|
||||
|
||||
**Monitoring:**
|
||||
- Episode creation rate
|
||||
- Grouping accuracy feedback
|
||||
- Performance metrics
|
||||
|
||||
---
|
||||
|
||||
### Phase 3: General Availability
|
||||
|
||||
**Duration:** Ongoing
|
||||
|
||||
**Scope:**
|
||||
- Enabled by default for new projects
|
||||
- Existing projects can opt-in via Settings
|
||||
|
||||
**Milestones:**
|
||||
- Remove "Beta" badge
|
||||
- Enable by default for all new projects
|
||||
- Provide migration tool for existing alerts
|
||||
|
||||
---
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
### No Breaking Changes
|
||||
|
||||
1. **Existing alerts unchanged** - episodeId is nullable, defaults to null
|
||||
2. **Existing API unchanged** - new fields added but not required
|
||||
3. **Opt-in only** - for existing projects, grouping stays disabled until it is enabled in Settings and grouping rules are created
|
||||
|
||||
### Gradual Adoption
|
||||
|
||||
1. Users create grouping rules when ready
|
||||
2. Only new alerts are grouped (after rule creation)
|
||||
3. No retroactive grouping unless explicitly triggered
|
||||
|
||||
---
|
||||
|
||||
## Data Migration (Optional)
|
||||
|
||||
### Retroactive Alert Grouping
|
||||
|
||||
For users who want to group existing alerts:
|
||||
|
||||
```typescript
|
||||
// /Worker/Jobs/AlertEpisode/RetroactiveGrouping.ts
|
||||
|
||||
export async function retroactivelyGroupAlerts(
|
||||
projectId: ObjectID,
|
||||
ruleId: ObjectID,
|
||||
startDate: Date,
|
||||
endDate: Date
|
||||
): Promise<void> {
|
||||
// Get rule
|
||||
const rule = await AlertGroupingRuleService.findOneById({ id: ruleId });
|
||||
|
||||
// Get alerts in date range
|
||||
const alerts = await AlertService.findBy({
|
||||
query: {
|
||||
projectId,
|
||||
createdAt: QueryHelper.between(startDate, endDate),
|
||||
episodeId: QueryHelper.isNull(),
|
||||
},
|
||||
select: { ... },
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
// Group alerts
|
||||
for (const alert of alerts) {
|
||||
await GroupingEngine.processAlert(alert, projectId);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
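This would be triggered via the Admin UI or an API endpoint. Note that the sketch above loads `rule` but never passes it to `GroupingEngine.processAlert`; the final implementation needs to restrict grouping to the selected rule.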
|
||||
|
||||
---
|
||||
|
||||
## Rollback Plan
|
||||
|
||||
### Database Rollback
|
||||
|
||||
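If issues are discovered, the migrations can be rolled back. Assuming this script wraps TypeORM's `migration:revert`, each run reverts only the most recently executed migration, so run it once per migration (five times for the migrations above):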
|
||||
|
||||
```bash
|
||||
npm run migration:revert
|
||||
```
|
||||
|
||||
### Feature Flag Disable
|
||||
|
||||
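Immediately disable grouping for all projects (the `isGroupingEnabled` check shown earlier must also honor this variable for the switch to take effect):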
|
||||
|
||||
```bash
|
||||
# Set environment variable
|
||||
ALERT_GROUPING_GLOBAL_DISABLE=true
|
||||
```
|
||||
|
||||
### Data Preservation
|
||||
|
||||
- Episodes and members remain in the database (provided the migrations are not reverted; the `down` methods drop these tables)
|
||||
- Alerts keep episodeId reference
|
||||
- Can be re-enabled later without data loss
|
||||
|
||||
---
|
||||
|
||||
## Monitoring & Alerts
|
||||
|
||||
### Key Metrics
|
||||
|
||||
| Metric | Description | Threshold |
|
||||
|--------|-------------|-----------|
|
||||
| `episode_creation_rate` | Episodes created per hour | Monitor for anomalies |
|
||||
| `grouping_latency_p99` | Time to group an alert | < 50ms |
|
||||
| `episode_alert_ratio` | Avg alerts per episode | > 2 (effective grouping) |
|
||||
| `grouping_engine_errors` | Errors in grouping | 0 |
|
||||
|
||||
### Dashboards
|
||||
|
||||
Create monitoring dashboards for:
|
||||
- Episode creation over time
|
||||
- Grouping rule effectiveness
|
||||
- Performance metrics
|
||||
- Error rates
|
||||
|
||||
---
|
||||
|
||||
## Checklist
|
||||
|
||||
### Pre-Migration
|
||||
- [ ] Review migration scripts
|
||||
- [ ] Test migrations on staging
|
||||
- [ ] Backup production database
|
||||
- [ ] Prepare rollback procedure
|
||||
|
||||
### Migration
|
||||
- [ ] Run migrations in order
|
||||
- [ ] Verify table creation
|
||||
- [ ] Verify index creation
|
||||
- [ ] Verify foreign keys
|
||||
|
||||
### Post-Migration
|
||||
- [ ] Deploy updated API
|
||||
- [ ] Deploy updated Worker
|
||||
- [ ] Deploy updated Dashboard
|
||||
- [ ] Enable feature flags for alpha
|
||||
- [ ] Monitor metrics
|
||||
|
||||
### GA Release
|
||||
- [ ] Remove beta badges
|
||||
- [ ] Update documentation
|
||||
- [ ] Enable for new projects
|
||||
- [ ] Announce to users
|
||||
117
Docs/Plan/AlertGrouping/README.md
Normal file
117
Docs/Plan/AlertGrouping/README.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# Alert Grouping / Episodes Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This sub-plan details the implementation of Alert Grouping and Episodes functionality for OneUptime. This feature groups related alerts into logical containers called "Episodes" to reduce noise and help operators focus on root causes rather than individual symptoms.
|
||||
|
||||
## Documents
|
||||
|
||||
| Document | Description |
|
||||
|----------|-------------|
|
||||
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
|
||||
| [2-Backend.md](./2-Backend.md) | Backend services and grouping engine |
|
||||
| [3-API.md](./3-API.md) | REST API endpoints |
|
||||
| [4-UI.md](./4-UI.md) | Frontend components and pages |
|
||||
| [5-Migration.md](./5-Migration.md) | Database migrations and rollout |
|
||||
|
||||
## Feature Summary
|
||||
|
||||
### What is an Episode?
|
||||
|
||||
An **Episode** is a container that groups related alerts together. Instead of seeing 50 individual "connection timeout" alerts, operators see one episode: "Database Connectivity Issues (50 alerts)".
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
1. **Automatic Grouping** - Rules-based grouping of alerts into episodes
|
||||
2. **Time-Window Grouping** - Group alerts occurring within N minutes
|
||||
3. **Field-Based Grouping** - Group by monitor, severity, labels, etc.
|
||||
4. **Manual Management** - Merge, split, add/remove alerts from episodes
|
||||
5. **Episode Lifecycle** - Active → Acknowledged → Resolved states
|
||||
6. **Root Cause Tracking** - Document root cause analysis per episode
|
||||
|
||||
### User Stories
|
||||
|
||||
```
|
||||
As an operator, I want to see related alerts grouped together
|
||||
so that I can focus on root causes instead of individual symptoms.
|
||||
|
||||
As an operator, I want to acknowledge an entire episode at once
|
||||
so that I don't have to acknowledge each alert individually.
|
||||
|
||||
As a team lead, I want to configure grouping rules
|
||||
so that alerts are automatically organized by our team's workflow.
|
||||
|
||||
As an operator, I want to document the root cause of an episode
|
||||
so that the team can learn from past incidents.
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Core Models & Basic Grouping (Week 1-2)
|
||||
|
||||
- [ ] Create AlertEpisode model
|
||||
- [ ] Create AlertEpisodeMember model
|
||||
- [ ] Create AlertGroupingRule model
|
||||
- [ ] Implement basic time-window grouping engine
|
||||
- [ ] Integrate with alert creation flow
|
||||
|
||||
### Phase 2: Episode Management (Week 3)
|
||||
|
||||
- [ ] Episode state management (acknowledge, resolve)
|
||||
- [ ] Episode assignment (owners, teams)
|
||||
- [ ] Episode timeline tracking
|
||||
- [ ] Manual alert management (add/remove)
|
||||
|
||||
### Phase 3: UI - List & Detail Views (Week 4-5)
|
||||
|
||||
- [ ] Episodes list page
|
||||
- [ ] Episode detail page
|
||||
- [ ] Episode actions (acknowledge, resolve, assign)
|
||||
- [ ] Alert-to-episode linking in alerts table
|
||||
|
||||
### Phase 4: UI - Configuration (Week 6)
|
||||
|
||||
- [ ] Grouping rules list page
|
||||
- [ ] Create/edit grouping rule form
|
||||
- [ ] Rule testing functionality
|
||||
- [ ] Episode badge in alerts table
|
||||
|
||||
### Phase 5: Advanced Features (Week 7-8)
|
||||
|
||||
- [ ] Field-based grouping
|
||||
- [ ] Episode merge/split functionality
|
||||
- [ ] Episode notifications
|
||||
- [ ] Analytics and metrics
|
||||
|
||||
## Dependencies
|
||||
|
||||
### Existing Components Used
|
||||
|
||||
- `Alert` model and `AlertService`
|
||||
- `AlertState` and `AlertStateTimeline`
|
||||
- Dashboard routing and layout components
|
||||
- ModelTable and ModelForm components
|
||||
- On-call notification system
|
||||
|
||||
### New Components Created
|
||||
|
||||
- `AlertEpisode` model
|
||||
- `AlertEpisodeMember` model
|
||||
- `AlertGroupingRule` model
|
||||
- `GroupingEngine` service
|
||||
- Episode UI pages and components
|
||||
|
||||
## Success Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Alert-to-episode ratio | 5:1 or higher |
|
||||
| Episode acknowledgment time | 50% faster than individual alerts |
|
||||
| User adoption | 80% of projects with grouping rules |
|
||||
| Processing latency | < 30ms added to alert creation |
|
||||
|
||||
## References
|
||||
|
||||
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
|
||||
- [Splunk ITSI Episode Review](https://docs.splunk.com/Documentation/ITSI)
|
||||
- [PagerDuty Alert Grouping](https://support.pagerduty.com/docs/alert-grouping)
|
||||
774
Docs/Plan/AlertStormDetection/1-DataModels.md
Normal file
774
Docs/Plan/AlertStormDetection/1-DataModels.md
Normal file
@@ -0,0 +1,774 @@
|
||||
# Data Models for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the database models required for Alert Storm Detection and Noise Reduction Analytics functionality.
|
||||
|
||||
## Entity Relationship Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────┐
|
||||
│ AlertStormEvent │
|
||||
├─────────────────────────┤
|
||||
│ id │
|
||||
│ projectId │
|
||||
│ status (active/resolved)│
|
||||
│ startedAt │
|
||||
│ endedAt │
|
||||
│ peakAlertRate │
|
||||
│ normalAlertRate │
|
||||
│ multiplier │
|
||||
│ affectedMonitors (JSON) │
|
||||
│ totalAlertsInStorm │
|
||||
└─────────────────────────┘
|
||||
|
||||
┌─────────────────────────┐
|
||||
│ NoiseReductionMetric │
|
||||
├─────────────────────────┤
|
||||
│ id │
|
||||
│ projectId │
|
||||
│ date │
|
||||
│ totalAlertTriggers │
|
||||
│ deduplicated │
|
||||
│ suppressed │
|
||||
│ alertsGrouped │
|
||||
│ notificationsSent │
|
||||
│ noiseReductionPercent │
|
||||
└─────────────────────────┘
|
||||
|
||||
┌─────────────────────────┐
|
||||
│ AlertVolumeSnapshot │
|
||||
├─────────────────────────┤
|
||||
│ id │
|
||||
│ projectId │
|
||||
│ timestamp │
|
||||
│ alertCount │
|
||||
│ intervalMinutes │
|
||||
└─────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Model Definitions
|
||||
|
||||
### 1. AlertStormEvent
|
||||
|
||||
Records storm events for tracking and analysis.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/AlertStormEvent.ts`
|
||||
|
||||
```typescript
|
||||
import {
|
||||
Column,
|
||||
Entity,
|
||||
Index,
|
||||
JoinColumn,
|
||||
ManyToOne,
|
||||
} from 'typeorm';
|
||||
import BaseModel from './DatabaseBaseModel/DatabaseBaseModel';
|
||||
import Project from './Project';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import ColumnType from 'Common/Types/Database/ColumnType';
|
||||
import TableColumnType from 'Common/Types/Database/TableColumnType';
|
||||
import Permission from 'Common/Types/Permission';
|
||||
import IconProp from 'Common/Types/Icon/IconProp';
|
||||
|
||||
export enum StormStatus {
|
||||
Active = 'active',
|
||||
Resolved = 'resolved',
|
||||
}
|
||||
|
||||
export enum StormSeverity {
|
||||
Elevated = 'elevated', // 2x - 3x normal
|
||||
Storm = 'storm', // 3x - 5x normal
|
||||
Critical = 'critical', // >= 5x normal
|
||||
}
|
||||
|
||||
export interface AffectedMonitor {
|
||||
monitorId: string;
|
||||
monitorName: string;
|
||||
alertCount: number;
|
||||
}
|
||||
|
||||
@EnableDocumentation()
|
||||
@TableAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
delete: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
})
|
||||
@CrudApiEndpoint(new Route('/alert-storm-event'))
|
||||
@TableMetadata({
|
||||
tableName: 'AlertStormEvent',
|
||||
singularName: 'Storm Event',
|
||||
pluralName: 'Storm Events',
|
||||
icon: IconProp.Alert,
|
||||
tableDescription: 'Records of alert storm events',
|
||||
})
|
||||
@Entity({
|
||||
name: 'AlertStormEvent',
|
||||
})
|
||||
export default class AlertStormEvent extends BaseModel {
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// PROJECT
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Entity,
|
||||
modelType: Project,
|
||||
title: 'Project',
|
||||
})
|
||||
@ManyToOne(() => Project, {
|
||||
onDelete: 'CASCADE',
|
||||
orphanedRowAction: 'delete',
|
||||
})
|
||||
@JoinColumn({ name: 'projectId' })
|
||||
public project?: Project = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ObjectID,
|
||||
title: 'Project ID',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ObjectID,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public projectId?: ObjectID = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// STATUS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ShortText,
|
||||
title: 'Status',
|
||||
description: 'Current status of the storm',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ShortText,
|
||||
length: 20,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public status?: StormStatus = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ShortText,
|
||||
title: 'Severity',
|
||||
description: 'Severity level of the storm',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ShortText,
|
||||
length: 20,
|
||||
nullable: false,
|
||||
})
|
||||
public severity?: StormSeverity = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// TIMING
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Date,
|
||||
title: 'Started At',
|
||||
description: 'When the storm was first detected',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Date,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public startedAt?: Date = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Date,
|
||||
title: 'Ended At',
|
||||
description: 'When the storm ended',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Date,
|
||||
nullable: true,
|
||||
})
|
||||
public endedAt?: Date = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Duration Minutes',
|
||||
description: 'Total duration of the storm in minutes',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: true,
|
||||
})
|
||||
public durationMinutes?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Peak Alert Rate',
|
||||
description: 'Peak alerts per hour during storm',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
})
|
||||
public peakAlertRate?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Normal Alert Rate',
|
||||
description: 'Normal alerts per hour (baseline)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
})
|
||||
public normalAlertRate?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Multiplier',
|
||||
description: 'How many times normal the peak rate was',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Decimal,
|
||||
precision: 10,
|
||||
scale: 2,
|
||||
nullable: false,
|
||||
})
|
||||
public multiplier?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Total Alerts',
|
||||
description: 'Total alerts during the storm',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public totalAlertsInStorm?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// AFFECTED MONITORS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.JSON,
|
||||
title: 'Affected Monitors',
|
||||
description: 'Top monitors contributing to the storm',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.JSON,
|
||||
nullable: true,
|
||||
})
|
||||
public affectedMonitors?: Array<AffectedMonitor> = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// SUPPRESSION
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Boolean,
|
||||
title: 'Emergency Suppression Active',
|
||||
description: 'Whether emergency suppression was activated',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Boolean,
|
||||
nullable: false,
|
||||
default: false,
|
||||
})
|
||||
public emergencySuppressionActive?: boolean = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Suppressed During Storm',
|
||||
description: 'Alerts suppressed during storm (if emergency suppression active)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public suppressedDuringStorm?: number = undefined;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. NoiseReductionMetric
|
||||
|
||||
Daily metrics for noise reduction analytics.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/NoiseReductionMetric.ts`
|
||||
|
||||
```typescript
|
||||
@TableMetadata({
|
||||
tableName: 'NoiseReductionMetric',
|
||||
singularName: 'Noise Reduction Metric',
|
||||
pluralName: 'Noise Reduction Metrics',
|
||||
icon: IconProp.ChartBar,
|
||||
tableDescription: 'Daily noise reduction statistics',
|
||||
})
|
||||
@Entity({
|
||||
name: 'NoiseReductionMetric',
|
||||
})
|
||||
export default class NoiseReductionMetric extends BaseModel {
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// PROJECT
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ObjectID,
|
||||
title: 'Project ID',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ObjectID,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public projectId?: ObjectID = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// DATE
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Date,
|
||||
title: 'Date',
|
||||
description: 'Date for these metrics',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Date,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public date?: Date = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// ALERT COUNTS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Total Alert Triggers',
|
||||
description: 'Total number of alert triggers (before noise reduction)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public totalAlertTriggers?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Alerts Created',
|
||||
description: 'Actual alerts created (after deduplication/suppression)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public alertsCreated?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// DEDUPLICATION METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Deduplicated',
|
||||
description: 'Alerts prevented by deduplication',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public deduplicated?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// SUPPRESSION METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Suppressed',
|
||||
description: 'Alerts prevented by suppression rules',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public suppressed?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Suppressed by Maintenance',
|
||||
description: 'Alerts suppressed by maintenance windows',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public suppressedByMaintenance?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Suppressed by Rate Limit',
|
||||
description: 'Alerts suppressed by rate limits',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public suppressedByRateLimit?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// GROUPING METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Alerts Grouped',
|
||||
description: 'Alerts grouped into episodes',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public alertsGrouped?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Episodes Created',
|
||||
description: 'Number of episodes created',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public episodesCreated?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// NOTIFICATION METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Notifications Sent',
|
||||
description: 'Actual notifications sent (after all filtering)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public notificationsSent?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Notifications Suppressed',
|
||||
description: 'Notifications that were suppressed',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public notificationsSuppressed?: number = undefined;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
// CALCULATED METRICS
|
||||
// ─────────────────────────────────────────────────────────────
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin, Permission.ProjectMember],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Noise Reduction Percent',
|
||||
description: 'Overall noise reduction percentage',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Decimal,
|
||||
precision: 5,
|
||||
scale: 2,
|
||||
nullable: false,
|
||||
default: 0,
|
||||
})
|
||||
public noiseReductionPercent?: number = undefined;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. AlertVolumeSnapshot
|
||||
|
||||
Periodic snapshots of alert volume for trend analysis.
|
||||
|
||||
**File Location:** `/Common/Models/DatabaseModels/AlertVolumeSnapshot.ts`
|
||||
|
||||
```typescript
|
||||
@TableMetadata({
|
||||
tableName: 'AlertVolumeSnapshot',
|
||||
singularName: 'Volume Snapshot',
|
||||
pluralName: 'Volume Snapshots',
|
||||
icon: IconProp.ChartLine,
|
||||
tableDescription: 'Periodic alert volume snapshots',
|
||||
})
|
||||
@Entity({
|
||||
name: 'AlertVolumeSnapshot',
|
||||
})
|
||||
export default class AlertVolumeSnapshot extends BaseModel {
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.ObjectID,
|
||||
title: 'Project ID',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.ObjectID,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public projectId?: ObjectID = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Date,
|
||||
title: 'Timestamp',
|
||||
description: 'When this snapshot was taken',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Date,
|
||||
nullable: false,
|
||||
})
|
||||
@Index()
|
||||
public timestamp?: Date = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Alert Count',
|
||||
description: 'Number of alerts in this interval',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
})
|
||||
public alertCount?: number = undefined;
|
||||
|
||||
@ColumnAccessControl({
|
||||
create: [],
|
||||
read: [Permission.ProjectOwner, Permission.ProjectAdmin],
|
||||
update: [],
|
||||
})
|
||||
@TableColumn({
|
||||
type: TableColumnType.Number,
|
||||
title: 'Interval Minutes',
|
||||
description: 'Interval size in minutes (e.g., 5, 15, 60)',
|
||||
})
|
||||
@Column({
|
||||
type: ColumnType.Number,
|
||||
nullable: false,
|
||||
default: 5,
|
||||
})
|
||||
public intervalMinutes?: number = undefined;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Indexes
|
||||
|
||||
```sql
|
||||
-- AlertStormEvent indexes
|
||||
CREATE INDEX idx_storm_event_project_status
|
||||
ON "AlertStormEvent" ("projectId", "status", "startedAt" DESC);
|
||||
|
||||
CREATE INDEX idx_storm_event_active
|
||||
ON "AlertStormEvent" ("projectId", "status")
|
||||
WHERE "status" = 'active';
|
||||
|
||||
-- NoiseReductionMetric indexes
|
||||
CREATE INDEX idx_noise_metric_project_date
|
||||
ON "NoiseReductionMetric" ("projectId", "date" DESC);
|
||||
|
||||
CREATE UNIQUE INDEX idx_noise_metric_unique
|
||||
ON "NoiseReductionMetric" ("projectId", "date");
|
||||
|
||||
-- AlertVolumeSnapshot indexes
|
||||
CREATE INDEX idx_volume_snapshot_project_time
|
||||
ON "AlertVolumeSnapshot" ("projectId", "timestamp" DESC);
|
||||
|
||||
-- Partition by time for efficient cleanup
|
||||
-- Consider partitioning AlertVolumeSnapshot by month
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
- [ ] Create AlertStormEvent model
|
||||
- [ ] Create NoiseReductionMetric model
|
||||
- [ ] Create AlertVolumeSnapshot model
|
||||
- [ ] Register models in model registry
|
||||
- [ ] Create database migrations
|
||||
- [ ] Add indexes
|
||||
- [ ] Update API permissions
|
||||
630
Docs/Plan/AlertStormDetection/2-Backend.md
Normal file
630
Docs/Plan/AlertStormDetection/2-Backend.md
Normal file
@@ -0,0 +1,630 @@
|
||||
# Backend Implementation for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the backend services and components required for Alert Storm Detection and Noise Reduction Analytics.
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. StormDetector
|
||||
|
||||
Main service for detecting alert storms.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/StormDetector.ts`
|
||||
|
||||
```typescript
|
||||
import AlertService from '../../Services/AlertService';
|
||||
import AlertStormEventService from '../../Services/AlertStormEventService';
|
||||
import AlertStormEvent, {
|
||||
StormStatus,
|
||||
StormSeverity,
|
||||
AffectedMonitor,
|
||||
} from '../../Models/DatabaseModels/AlertStormEvent';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import OneUptimeDate from 'Common/Types/Date';
|
||||
import QueryHelper from '../../Types/Database/QueryHelper';
|
||||
|
||||
export interface StormStatus {
|
||||
isStorm: boolean;
|
||||
severity: StormSeverity | null;
|
||||
currentRate: number;
|
||||
normalRate: number;
|
||||
multiplier: number;
|
||||
affectedMonitors?: Array<AffectedMonitor>;
|
||||
activeStormEvent?: AlertStormEvent;
|
||||
}
|
||||
|
||||
export interface StormConfig {
|
||||
// Multiplier threshold for storm detection
|
||||
stormThreshold: number; // Default: 3
|
||||
|
||||
// Multiplier threshold for critical storm
|
||||
criticalThreshold: number; // Default: 5
|
||||
|
||||
// Minimum alerts per hour to consider for storm
|
||||
minimumAlertRate: number; // Default: 10
|
||||
|
||||
// Historical lookback hours for baseline
|
||||
baselineHours: number; // Default: 24
|
||||
|
||||
// Enable emergency suppression
|
||||
enableEmergencySuppression: boolean; // Default: false
|
||||
}
|
||||
|
||||
export const DEFAULT_STORM_CONFIG: StormConfig = {
|
||||
stormThreshold: 3,
|
||||
criticalThreshold: 5,
|
||||
minimumAlertRate: 10,
|
||||
baselineHours: 24,
|
||||
enableEmergencySuppression: false,
|
||||
};
|
||||
|
||||
export default class StormDetector {
|
||||
/**
|
||||
* Check current storm status for a project
|
||||
*/
|
||||
public static async checkStatus(
|
||||
projectId: ObjectID,
|
||||
config?: Partial<StormConfig>
|
||||
): Promise<StormStatus> {
|
||||
const mergedConfig = { ...DEFAULT_STORM_CONFIG, ...config };
|
||||
|
||||
const now = new Date();
|
||||
const oneHourAgo = OneUptimeDate.addRemoveHours(now, -1);
|
||||
const baselineStart = OneUptimeDate.addRemoveHours(now, -mergedConfig.baselineHours);
|
||||
|
||||
// Get current hour's alert count
|
||||
const currentCount = await AlertService.countBy({
|
||||
query: {
|
||||
projectId,
|
||||
createdAt: QueryHelper.greaterThan(oneHourAgo),
|
||||
},
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
// Get historical average (excluding current hour)
|
||||
const historicalCount = await AlertService.countBy({
|
||||
query: {
|
||||
projectId,
|
||||
createdAt: QueryHelper.between(baselineStart, oneHourAgo),
|
||||
},
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
const hoursInBaseline = mergedConfig.baselineHours - 1;
|
||||
const normalRate = hoursInBaseline > 0
|
||||
? historicalCount / hoursInBaseline
|
||||
: mergedConfig.minimumAlertRate;
|
||||
|
||||
const currentRate = currentCount;
|
||||
const multiplier = normalRate > 0 ? currentRate / normalRate : currentRate;
|
||||
|
||||
// Determine storm status
|
||||
let isStorm = false;
|
||||
let severity: StormSeverity | null = null;
|
||||
|
||||
if (multiplier >= mergedConfig.criticalThreshold) {
|
||||
isStorm = true;
|
||||
severity = StormSeverity.Critical;
|
||||
} else if (multiplier >= mergedConfig.stormThreshold) {
|
||||
isStorm = true;
|
||||
severity = StormSeverity.Storm;
|
||||
} else if (multiplier >= 2) {
|
||||
severity = StormSeverity.Elevated;
|
||||
}
|
||||
|
||||
// Only consider it a storm if rate is above minimum
|
||||
if (currentRate < mergedConfig.minimumAlertRate) {
|
||||
isStorm = false;
|
||||
severity = null;
|
||||
}
|
||||
|
||||
// Get affected monitors if storm
|
||||
let affectedMonitors: Array<AffectedMonitor> | undefined;
|
||||
if (isStorm) {
|
||||
affectedMonitors = await this.getTopAlertingMonitors(projectId, oneHourAgo);
|
||||
}
|
||||
|
||||
// Check for active storm event
|
||||
const activeStormEvent = await AlertStormEventService.findOneBy({
|
||||
query: {
|
||||
projectId,
|
||||
status: StormStatus.Active,
|
||||
},
|
||||
select: {
|
||||
_id: true,
|
||||
startedAt: true,
|
||||
peakAlertRate: true,
|
||||
},
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
return {
|
||||
isStorm,
|
||||
severity,
|
||||
currentRate,
|
||||
normalRate: Math.round(normalRate * 100) / 100,
|
||||
multiplier: Math.round(multiplier * 100) / 100,
|
||||
affectedMonitors,
|
||||
activeStormEvent: activeStormEvent || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get top alerting monitors
|
||||
*/
|
||||
private static async getTopAlertingMonitors(
|
||||
projectId: ObjectID,
|
||||
since: Date
|
||||
): Promise<Array<AffectedMonitor>> {
|
||||
const result = await AlertService.aggregate({
|
||||
pipeline: [
|
||||
{
|
||||
$match: {
|
||||
projectId: projectId.toString(),
|
||||
createdAt: { $gte: since },
|
||||
monitorId: { $ne: null },
|
||||
},
|
||||
},
|
||||
{
|
||||
$group: {
|
||||
_id: '$monitorId',
|
||||
count: { $sum: 1 },
|
||||
},
|
||||
},
|
||||
{ $sort: { count: -1 } },
|
||||
{ $limit: 10 },
|
||||
],
|
||||
});
|
||||
|
||||
// Get monitor names
|
||||
const monitorIds = result.map((r) => new ObjectID(r._id));
|
||||
const monitors = await MonitorService.findBy({
|
||||
query: {
|
||||
_id: QueryHelper.any(monitorIds),
|
||||
},
|
||||
select: { _id: true, name: true },
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
const monitorMap = new Map(
|
||||
monitors.map((m) => [m.id?.toString(), m.name])
|
||||
);
|
||||
|
||||
return result.map((r) => ({
|
||||
monitorId: r._id,
|
||||
monitorName: monitorMap.get(r._id) || 'Unknown',
|
||||
alertCount: r.count,
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Process storm detection and create/update storm events
|
||||
*/
|
||||
public static async processStormDetection(
|
||||
projectId: ObjectID,
|
||||
config?: Partial<StormConfig>
|
||||
): Promise<void> {
|
||||
const status = await this.checkStatus(projectId, config);
|
||||
|
||||
if (status.isStorm && !status.activeStormEvent) {
|
||||
// New storm detected - create event
|
||||
await this.createStormEvent(projectId, status);
|
||||
} else if (status.isStorm && status.activeStormEvent) {
|
||||
// Storm ongoing - update event
|
||||
await this.updateStormEvent(status.activeStormEvent.id!, status);
|
||||
} else if (!status.isStorm && status.activeStormEvent) {
|
||||
// Storm ended - resolve event
|
||||
await this.resolveStormEvent(status.activeStormEvent.id!);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new storm event
|
||||
*/
|
||||
private static async createStormEvent(
|
||||
projectId: ObjectID,
|
||||
status: StormStatus
|
||||
): Promise<AlertStormEvent> {
|
||||
const event = await AlertStormEventService.create({
|
||||
data: {
|
||||
projectId,
|
||||
status: StormStatus.Active,
|
||||
severity: status.severity!,
|
||||
startedAt: new Date(),
|
||||
peakAlertRate: status.currentRate,
|
||||
normalAlertRate: status.normalRate,
|
||||
multiplier: status.multiplier,
|
||||
affectedMonitors: status.affectedMonitors,
|
||||
totalAlertsInStorm: status.currentRate,
|
||||
} as AlertStormEvent,
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
// Send notifications
|
||||
await NotificationService.sendStormStartNotification({
|
||||
projectId,
|
||||
stormEvent: event,
|
||||
});
|
||||
|
||||
logger.info(`Storm detected for project ${projectId}: ${status.multiplier}x normal`);
|
||||
|
||||
return event;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update an ongoing storm event
|
||||
*/
|
||||
private static async updateStormEvent(
|
||||
eventId: ObjectID,
|
||||
status: StormStatus
|
||||
): Promise<void> {
|
||||
const event = await AlertStormEventService.findOneById({
|
||||
id: eventId,
|
||||
select: { peakAlertRate: true, multiplier: true, totalAlertsInStorm: true },
|
||||
props: { isRoot: true },
|
||||
});
|
||||
|
||||
if (!event) return;
|
||||
|
||||
await AlertStormEventService.updateOneById({
|
||||
id: eventId,
|
||||
data: {
|
||||
peakAlertRate: Math.max(event.peakAlertRate || 0, status.currentRate),
|
||||
multiplier: Math.max(event.multiplier || 0, status.multiplier),
|
||||
totalAlertsInStorm: (event.totalAlertsInStorm || 0) + status.currentRate,
|
||||
affectedMonitors: status.affectedMonitors,
|
||||
},
|
||||
props: { isRoot: true },
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Close a storm event: mark it resolved, store its end time and duration,
 * and send the storm-end notification.
 *
 * Does nothing when the event cannot be found.
 *
 * @param eventId - ID of the storm event to resolve.
 */
private static async resolveStormEvent(eventId: ObjectID): Promise<void> {
  const stormEvent = await AlertStormEventService.findOneById({
    id: eventId,
    select: { startedAt: true, projectId: true },
    props: { isRoot: true },
  });

  if (!stormEvent) {
    return;
  }

  // Duration is measured from the stored start time to now, in whole minutes.
  const now = new Date();
  const elapsedMilliseconds = now.getTime() - stormEvent.startedAt!.getTime();
  const durationMinutes = Math.round(elapsedMilliseconds / 60000);

  await AlertStormEventService.updateOneById({
    id: eventId,
    data: {
      status: StormStatus.Resolved,
      endedAt: now,
      durationMinutes: durationMinutes,
    },
    props: { isRoot: true },
  });

  await NotificationService.sendStormEndNotification({
    projectId: stormEvent.projectId!,
    stormEventId: eventId,
    durationMinutes: durationMinutes,
  });

  logger.info(
    `Storm resolved for project ${stormEvent.projectId} after ${durationMinutes} minutes`
  );
}
|
||||
}
|
||||
|
||||
import MonitorService from '../../Services/MonitorService';
|
||||
import NotificationService from '../../Services/NotificationService';
|
||||
import logger from '../../Utils/Logger';
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. NoiseReductionAnalytics
|
||||
|
||||
Service for calculating and retrieving noise reduction metrics.
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/NoiseReductionAnalytics.ts`
|
||||
|
||||
```typescript
|
||||
import NoiseReductionMetric from '../../Models/DatabaseModels/NoiseReductionMetric';
|
||||
import NoiseReductionMetricService from '../../Services/NoiseReductionMetricService';
|
||||
import AlertService from '../../Services/AlertService';
|
||||
import SuppressedAlertLogService from '../../Services/SuppressedAlertLogService';
|
||||
import AlertFingerprintService from '../../Services/AlertFingerprintService';
|
||||
import AlertEpisodeService from '../../Services/AlertEpisodeService';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import OneUptimeDate from 'Common/Types/Date';
|
||||
import QueryHelper from '../../Types/Database/QueryHelper';
|
||||
|
||||
/**
 * Aggregated noise reduction figures for a date range, as returned by
 * NoiseReductionAnalytics.getSummary. Each count is the sum of the matching
 * daily metric values inside the period.
 */
export interface NoiseReductionSummary {
|
||||
// The date range that was requested, echoed back unchanged.
period: {
|
||||
startDate: Date;
|
||||
endDate: Date;
|
||||
};
|
||||
// Daily values are computed as alertsCreated + deduplicated + suppressed.
totalAlertTriggers: number;
|
||||
alertsCreated: number;
|
||||
deduplicated: number;
|
||||
suppressed: number;
|
||||
// Summed from the daily metric's `alertsGrouped` field.
grouped: number;
|
||||
notificationsSent: number;
|
||||
// (deduplicated + suppressed) / totalAlertTriggers * 100, rounded to two
// decimals; 0 when there were no triggers.
noiseReductionPercent: number;
|
||||
}
|
||||
|
||||
export default class NoiseReductionAnalytics {
  /**
   * Calculate daily noise reduction metrics for a project and store them.
   *
   * Counts, for the calendar day containing `date`: alerts created,
   * duplicates recorded on fingerprints, suppressed alerts, alerts attached
   * to an episode, and episodes started. The result is written to that day's
   * existing NoiseReductionMetric row, or a new row is created.
   *
   * @param projectId - Project whose activity is measured.
   * @param date - Any instant within the day to measure.
   * @returns The metric values for that day. When a row already existed this
   *   is the existing row merged with the new values, not a re-read record.
   */
  public static async calculateDailyMetrics(
    projectId: ObjectID,
    date: Date
  ): Promise<NoiseReductionMetric> {
    const startOfDay = OneUptimeDate.getStartOfDay(date);
    const endOfDay = OneUptimeDate.getEndOfDay(date);

    // These five reads do not depend on each other, so they are issued
    // together instead of paying for five sequential round trips.
    const [
      alertsCreated,
      fingerprints,
      suppressed,
      alertsGrouped,
      episodesCreated,
    ] = await Promise.all([
      // Alerts created during the day
      AlertService.countBy({
        query: {
          projectId,
          createdAt: QueryHelper.between(startOfDay, endOfDay),
        },
        props: { isRoot: true },
      }),
      // Fingerprints whose window started during the day
      AlertFingerprintService.findBy({
        query: {
          projectId,
          windowStartAt: QueryHelper.between(startOfDay, endOfDay),
        },
        select: { duplicateCount: true },
        props: { isRoot: true },
      }),
      // Alerts suppressed during the day
      SuppressedAlertLogService.countBy({
        query: {
          projectId,
          suppressedAt: QueryHelper.between(startOfDay, endOfDay),
        },
        props: { isRoot: true },
      }),
      // Alerts created during the day that belong to an episode
      AlertService.countBy({
        query: {
          projectId,
          createdAt: QueryHelper.between(startOfDay, endOfDay),
          episodeId: QueryHelper.notNull(),
        },
        props: { isRoot: true },
      }),
      // Episodes started during the day
      AlertEpisodeService.countBy({
        query: {
          projectId,
          startedAt: QueryHelper.between(startOfDay, endOfDay),
        },
        props: { isRoot: true },
      }),
    ]);

    // Deduplicated = total duplicates absorbed by those fingerprints.
    const deduplicated = fingerprints.reduce(
      (sum, fp) => sum + (fp.duplicateCount || 0),
      0
    );

    // A trigger either became an alert, was merged into one, or was suppressed.
    const totalAlertTriggers = alertsCreated + deduplicated + suppressed;

    // Create or update metric
    const existingMetric = await NoiseReductionMetricService.findOneBy({
      query: {
        projectId,
        date: startOfDay,
      },
      select: { _id: true },
      props: { isRoot: true },
    });

    const metricData: Partial<NoiseReductionMetric> = {
      projectId,
      date: startOfDay,
      totalAlertTriggers,
      alertsCreated,
      deduplicated,
      suppressed,
      alertsGrouped,
      episodesCreated,
      noiseReductionPercent: NoiseReductionAnalytics.calculateReductionPercent(
        deduplicated + suppressed,
        totalAlertTriggers
      ),
    };

    if (existingMetric) {
      await NoiseReductionMetricService.updateOneById({
        id: existingMetric.id!,
        data: metricData,
        props: { isRoot: true },
      });
      return { ...existingMetric, ...metricData } as NoiseReductionMetric;
    }

    return await NoiseReductionMetricService.create({
      data: metricData as NoiseReductionMetric,
      props: { isRoot: true },
    });
  }

  /**
   * Get noise reduction summary for a date range.
   *
   * Sums the stored daily metrics whose `date` falls between `startDate` and
   * `endDate` and derives the overall reduction percentage from the totals.
   *
   * @param projectId - Project to summarise.
   * @param startDate - Start of the period.
   * @param endDate - End of the period.
   * @returns Totals for the period; all zero when no daily metrics exist.
   */
  public static async getSummary(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<NoiseReductionSummary> {
    const metrics = await NoiseReductionMetricService.findBy({
      query: {
        projectId,
        date: QueryHelper.between(startDate, endDate),
      },
      select: {
        totalAlertTriggers: true,
        alertsCreated: true,
        deduplicated: true,
        suppressed: true,
        alertsGrouped: true,
        // NOTE(review): calculateDailyMetrics never writes notificationsSent,
        // so unless something else fills it in this total will stay 0.
        notificationsSent: true,
      },
      props: { isRoot: true },
    });

    const totals = metrics.reduce(
      (acc, m) => ({
        totalAlertTriggers: acc.totalAlertTriggers + (m.totalAlertTriggers || 0),
        alertsCreated: acc.alertsCreated + (m.alertsCreated || 0),
        deduplicated: acc.deduplicated + (m.deduplicated || 0),
        suppressed: acc.suppressed + (m.suppressed || 0),
        grouped: acc.grouped + (m.alertsGrouped || 0),
        notificationsSent: acc.notificationsSent + (m.notificationsSent || 0),
      }),
      {
        totalAlertTriggers: 0,
        alertsCreated: 0,
        deduplicated: 0,
        suppressed: 0,
        grouped: 0,
        notificationsSent: 0,
      }
    );

    return {
      period: { startDate, endDate },
      ...totals,
      // Recomputed from the summed counts rather than averaging daily
      // percentages, so busy days weigh more than quiet ones.
      noiseReductionPercent: NoiseReductionAnalytics.calculateReductionPercent(
        totals.deduplicated + totals.suppressed,
        totals.totalAlertTriggers
      ),
    };
  }

  /**
   * Share of alert triggers that were removed as noise, as a percentage
   * rounded to two decimal places. Returns 0 when there were no triggers.
   *
   * @param noiseCount - Deduplicated plus suppressed triggers.
   * @param totalTriggers - All alert triggers in the period.
   */
  private static calculateReductionPercent(
    noiseCount: number,
    totalTriggers: number
  ): number {
    if (totalTriggers > 0) {
      const percent = (noiseCount / totalTriggers) * 100;
      return Math.round(percent * 100) / 100;
    }
    return 0;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Worker Jobs
|
||||
|
||||
#### Storm Monitor Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertStorm/Monitor.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
import { EVERY_FIVE_MINUTES } from 'Common/Utils/CronTime';
import StormDetector from 'Common/Server/Utils/Alert/StormDetector';
import ProjectService from 'Common/Server/Services/ProjectService';
// `logger` is used in the error handler below but was never imported.
import logger from 'Common/Server/Utils/Logger';

/**
 * Every five minutes, run storm detection for each project that is not
 * blocked. Projects are processed one at a time, and a failure for one
 * project is logged without stopping the rest.
 */
RunCron(
  'AlertStorm:Monitor',
  { schedule: EVERY_FIVE_MINUTES, runOnStartup: false },
  async () => {
    // Get all active projects. The limit caps a single run at 1000 projects.
    const projects = await ProjectService.findBy({
      query: { isBlocked: false },
      select: { _id: true },
      limit: 1000,
      props: { isRoot: true },
    });

    for (const project of projects) {
      try {
        await StormDetector.processStormDetection(project.id!);
      } catch (error) {
        logger.error(
          `Error processing storm detection for project ${project.id}:`,
          error
        );
      }
    }
  }
);
|
||||
```
|
||||
|
||||
#### Daily Metrics Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/NoiseReduction/DailyMetrics.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
import { EVERY_DAY_AT_MIDNIGHT } from 'Common/Utils/CronTime';
import NoiseReductionAnalytics from 'Common/Server/Utils/Alert/NoiseReductionAnalytics';
import ProjectService from 'Common/Server/Services/ProjectService';
import OneUptimeDate from 'Common/Types/Date';
// `logger` is used below but was never imported.
import logger from 'Common/Server/Utils/Logger';

/**
 * Once a day at midnight, calculate the previous day's noise reduction
 * metrics for each project that is not blocked. A failure for one project is
 * logged without stopping the rest.
 */
RunCron(
  'NoiseReduction:DailyMetrics',
  { schedule: EVERY_DAY_AT_MIDNIGHT, runOnStartup: false },
  async () => {
    // Calculate metrics for yesterday
    const yesterday = OneUptimeDate.addRemoveDays(
      OneUptimeDate.getCurrentDate(),
      -1
    );

    // The limit caps a single run at 1000 projects.
    const projects = await ProjectService.findBy({
      query: { isBlocked: false },
      select: { _id: true },
      limit: 1000,
      props: { isRoot: true },
    });

    for (const project of projects) {
      try {
        await NoiseReductionAnalytics.calculateDailyMetrics(
          project.id!,
          yesterday
        );
      } catch (error) {
        logger.error(
          `Error calculating metrics for project ${project.id}:`,
          error
        );
      }
    }

    // This count is the number of projects attempted, including any that
    // failed and were logged above.
    logger.info(`Calculated daily noise reduction metrics for ${projects.length} projects`);
  }
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Phase 1: Storm Detection
|
||||
- [ ] Create StormDetector utility
|
||||
- [ ] Create AlertStormEventService
|
||||
- [ ] Implement storm detection algorithm
|
||||
- [ ] Create storm monitor worker job
|
||||
|
||||
### Phase 2: Notifications
|
||||
- [ ] Storm start notification
|
||||
- [ ] Storm end notification
|
||||
- [ ] Admin notification integration
|
||||
|
||||
### Phase 3: Analytics
|
||||
- [ ] Create NoiseReductionAnalytics utility
|
||||
- [ ] Create NoiseReductionMetricService
|
||||
- [ ] Implement daily metrics calculation
|
||||
- [ ] Create daily metrics worker job
|
||||
|
||||
### Phase 4: Testing
|
||||
- [ ] Unit tests for StormDetector
|
||||
- [ ] Unit tests for NoiseReductionAnalytics
|
||||
- [ ] Integration tests for worker jobs
|
||||
222
Docs/Plan/AlertStormDetection/3-API.md
Normal file
222
Docs/Plan/AlertStormDetection/3-API.md
Normal file
@@ -0,0 +1,222 @@
|
||||
# API Design for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the REST API endpoints for Alert Storm Detection and Noise Reduction Analytics.
|
||||
|
||||
## Storm Events API
|
||||
|
||||
### Get Current Storm Status
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm/status
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"isStorm": true,
|
||||
"severity": "storm",
|
||||
"currentRate": 150,
|
||||
"normalRate": 30,
|
||||
"multiplier": 5.0,
|
||||
"affectedMonitors": [
|
||||
{ "monitorId": "mon-1", "monitorName": "mysql-prod", "alertCount": 45 },
|
||||
{ "monitorId": "mon-2", "monitorName": "api-gateway", "alertCount": 32 }
|
||||
],
|
||||
"activeStormEvent": {
|
||||
"_id": "storm-event-1",
|
||||
"startedAt": "2026-01-20T10:00:00Z",
|
||||
"peakAlertRate": 180,
|
||||
"durationMinutes": 45
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### List Storm Events
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm-event
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `status` | string | Filter by status (active, resolved) |
|
||||
| `startedAt` | DateRange | Filter by start date |
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "storm-1",
|
||||
"status": "resolved",
|
||||
"severity": "critical",
|
||||
"startedAt": "2026-01-19T14:00:00Z",
|
||||
"endedAt": "2026-01-19T15:30:00Z",
|
||||
"durationMinutes": 90,
|
||||
"peakAlertRate": 250,
|
||||
"normalAlertRate": 30,
|
||||
"multiplier": 8.33,
|
||||
"totalAlertsInStorm": 450,
|
||||
"affectedMonitors": [...]
|
||||
}
|
||||
],
|
||||
"count": 15
|
||||
}
|
||||
```
|
||||
|
||||
### Get Storm Event Details
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm-event/{eventId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Noise Reduction Analytics API
|
||||
|
||||
### Get Noise Reduction Summary
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/noise-reduction/summary
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"period": {
|
||||
"startDate": "2026-01-13T00:00:00Z",
|
||||
"endDate": "2026-01-20T00:00:00Z"
|
||||
},
|
||||
"totalAlertTriggers": 10000,
|
||||
"alertsCreated": 3500,
|
||||
"deduplicated": 4000,
|
||||
"suppressed": 2500,
|
||||
"grouped": 1500,
|
||||
"notificationsSent": 2000,
|
||||
"noiseReductionPercent": 65.0,
|
||||
"breakdown": {
|
||||
"byDeduplication": 40.0,
|
||||
"bySuppression": 25.0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Get Daily Metrics
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/noise-reduction/daily
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"date": "2026-01-20",
|
||||
"totalAlertTriggers": 1500,
|
||||
"alertsCreated": 500,
|
||||
"deduplicated": 600,
|
||||
"suppressed": 400,
|
||||
"alertsGrouped": 200,
|
||||
"episodesCreated": 15,
|
||||
"noiseReductionPercent": 66.67
|
||||
},
|
||||
{
|
||||
"date": "2026-01-19",
|
||||
"totalAlertTriggers": 1200,
|
||||
"alertsCreated": 450
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Get Top Noise Sources
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/noise-reduction/top-sources
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"byMonitor": [
|
||||
{ "monitorId": "mon-1", "monitorName": "mysql-prod", "alertCount": 500, "duplicateCount": 300 },
|
||||
{ "monitorId": "mon-2", "monitorName": "api-gateway", "alertCount": 350, "duplicateCount": 150 }
|
||||
],
|
||||
"bySeverity": [
|
||||
{ "severityId": "sev-1", "severityName": "Warning", "alertCount": 600 },
|
||||
{ "severityId": "sev-2", "severityName": "Critical", "alertCount": 200 }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Storm Configuration API
|
||||
|
||||
### Get Storm Config
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-storm/config
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"stormThreshold": 3,
|
||||
"criticalThreshold": 5,
|
||||
"minimumAlertRate": 10,
|
||||
"baselineHours": 24,
|
||||
"enableEmergencySuppression": false,
|
||||
"notifyOnStormStart": true,
|
||||
"notifyOnStormEnd": true
|
||||
}
|
||||
```
|
||||
|
||||
### Update Storm Config
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-storm/config
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Storm API
|
||||
- [ ] GET /alert-storm/status
|
||||
- [ ] GET /alert-storm-event (list)
|
||||
- [ ] GET /alert-storm-event/:id
|
||||
- [ ] GET /alert-storm/config
|
||||
- [ ] PUT /alert-storm/config
|
||||
|
||||
### Analytics API
|
||||
- [ ] GET /noise-reduction/summary
|
||||
- [ ] GET /noise-reduction/daily
|
||||
- [ ] GET /noise-reduction/top-sources
|
||||
519
Docs/Plan/AlertStormDetection/4-UI.md
Normal file
519
Docs/Plan/AlertStormDetection/4-UI.md
Normal file
@@ -0,0 +1,519 @@
|
||||
# UI Implementation for Alert Storm Detection
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the frontend components and pages required for Alert Storm Detection and Noise Reduction Analytics functionality.
|
||||
|
||||
## Navigation Structure
|
||||
|
||||
```
|
||||
Dashboard
|
||||
├── Alerts
|
||||
│ ├── Alerts (existing)
|
||||
│ ├── Episodes (from Grouping plan)
|
||||
│ └── Storm History (NEW)
|
||||
└── Settings
|
||||
└── Alerts
|
||||
├── Alert States (existing)
|
||||
├── Alert Severities (existing)
|
||||
├── Grouping Rules
|
||||
├── Suppression Rules
|
||||
├── Deduplication
|
||||
└── Storm Detection (NEW)
|
||||
|
||||
Analytics (NEW section or add to existing)
|
||||
└── Noise Reduction Dashboard (NEW)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages to Create
|
||||
|
||||
### 1. Storm Detection Settings Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertStormDetection.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-storm-detection`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Alert Storm Detection │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Alert Storm Detection identifies when alert volume spikes abnormally above │ │
|
||||
│ │ historical baselines. This helps identify major incidents and prevent alert │ │
|
||||
│ │ fatigue during outages. │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ DETECTION THRESHOLDS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Storm Threshold (multiplier) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 3 │ x normal rate │
|
||||
│ └──────────┘ │
|
||||
│ Alert volume must exceed this multiplier to be considered a storm. │
|
||||
│ │
|
||||
│ Critical Storm Threshold (multiplier) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 5 │ x normal rate │
|
||||
│ └──────────┘ │
|
||||
│ Storms exceeding this multiplier are marked as critical. │
|
||||
│ │
|
||||
│ Minimum Alert Rate │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 10 │ alerts per hour │
|
||||
│ └──────────┘ │
|
||||
│ Minimum baseline rate required before storm detection activates. │
|
||||
│ │
|
||||
│ Baseline Period │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 24 │ hours │
|
||||
│ └──────────┘ │
|
||||
│ Historical period used to calculate normal alert rate. │
|
||||
│ │
|
||||
│ NOTIFICATIONS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ☑ Notify when storm starts │
|
||||
│ ☑ Notify when storm ends │
|
||||
│ ☐ Enable emergency suppression during storms │
|
||||
│ ⚠️ This will automatically suppress non-critical alerts during storms │
|
||||
│ │
|
||||
│ [Save Changes] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Storm History Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/StormHistory.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/storm-history`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts > Storm History │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 🔴 ACTIVE STORM Started: 45 minutes ago │ │
|
||||
│ │ │ │
|
||||
│ │ Current Rate: 150 alerts/hour (5x normal) │ │
|
||||
│ │ Peak Rate: 180 alerts/hour │ │
|
||||
│ │ Affected Monitors: 12 │ │
|
||||
│ │ │ │
|
||||
│ │ [View Details] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ STORM EVENTS [Filters ▼] │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌──────────┬──────────┬───────────┬──────────┬──────────┬──────────┬─────────────┐ │
|
||||
│ │ Status │ Severity │ Started │ Duration │ Peak │ Alerts │ Monitors │ │
|
||||
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
|
||||
│ │ 🔴 Active│ Critical │ Today │ 45m │ 180/hr │ 450 │ 12 │ │
|
||||
│ │ │ │ 10:15 AM │ │ (6x) │ │ │ │
|
||||
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
|
||||
│ │ ✅ Resv'd│ Storm │ Yesterday │ 1h 30m │ 120/hr │ 280 │ 8 │ │
|
||||
│ │ │ │ 2:30 PM │ │ (4x) │ │ │ │
|
||||
│ ├──────────┼──────────┼───────────┼──────────┼──────────┼──────────┼─────────────┤ │
|
||||
│ │ ✅ Resv'd│ Critical │ Jan 18 │ 2h 15m │ 250/hr │ 620 │ 15 │ │
|
||||
│ │ │ │ 8:00 AM │ │ (8.3x) │ │ │ │
|
||||
│ └──────────┴──────────┴───────────┴──────────┴──────────┴──────────┴─────────────┘ │
|
||||
│ │
|
||||
│ ◄ Previous Page 1 of 3 Next ► │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Storm Event Detail Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/StormEventDetail.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/storm-history/:stormEventId`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Storm History > Storm Event #storm-123 │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌───────────────────────────────────────┬─────────────────────────────────────────┐ │
|
||||
│ │ STORM SUMMARY │ ALERT VOLUME TIMELINE │ │
|
||||
│ │ │ │ │
|
||||
│ │ Status: 🔴 Active │ 250 ─┬───────────────────────────── │ │
|
||||
│ │ Severity: Critical │ │ ╭─────╮ │ │
|
||||
│ │ Started: Jan 20, 2026 10:15 AM │ 200 ─┤ ╭╯ ╰╮ │ │
|
||||
│ │ Duration: 45 minutes (ongoing) │ │ ╭╯ ╰╮ │ │
|
||||
│ │ │ 150 ─┤ ╭╯ Peak ╰─current │ │
|
||||
│ │ Peak Alert Rate: 180/hour │ │ ╭╯ 180/hr │ │
|
||||
│ │ Normal Rate: 30/hour │ 100 ─┤ ╭╯ │ │
|
||||
│ │ Multiplier: 6x │ │ ╭╯ │ │
|
||||
│ │ │ 50 ─┤──╯ │ │
|
||||
│ │ Total Alerts: 450 │ │ baseline: 30/hr ─ ─ ─ ─ ─ ─ │ │
|
||||
│ │ Affected Monitors: 12 │ 0 ─┴───────────────────────────── │ │
|
||||
│ │ │ 10:00 10:15 10:30 10:45 │ │
|
||||
│ └───────────────────────────────────────┴─────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ TOP ALERTING MONITORS │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────┬──────────┬──────────────────────────────────┐ │
|
||||
│ │ Monitor │ Alerts │ Distribution │ │
|
||||
│ ├──────────────────────────────────┼──────────┼──────────────────────────────────┤ │
|
||||
│ │ 🖥️ mysql-prod-01 │ 85 │ ██████████████████░░ 19% │ │
|
||||
│ │ 🖥️ api-gateway-main │ 72 │ ███████████████░░░░░ 16% │ │
|
||||
│ │ 🖥️ redis-cluster-a │ 58 │ ████████████░░░░░░░░ 13% │ │
|
||||
│ │ 🖥️ postgres-replica-02 │ 45 │ ██████████░░░░░░░░░░ 10% │ │
|
||||
│ │ 🖥️ load-balancer-east │ 38 │ ████████░░░░░░░░░░░░ 8% │ │
|
||||
│ └──────────────────────────────────┴──────────┴──────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ALERTS IN THIS STORM [View All] │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌───────┬──────────────────────────────────────┬──────────┬──────────────────────┐ │
|
||||
│ │ ID │ Title │ Monitor │ Time │ │
|
||||
│ ├───────┼──────────────────────────────────────┼──────────┼──────────────────────┤ │
|
||||
│ │ #1234 │ Connection timeout │ mysql-01 │ 10:45 AM │ │
|
||||
│ │ #1233 │ Response time exceeded threshold │ api-gw │ 10:44 AM │ │
|
||||
│ │ #1232 │ Memory usage critical │ redis-a │ 10:43 AM │ │
|
||||
│ └───────┴──────────────────────────────────────┴──────────┴──────────────────────┘ │
|
||||
│ │
|
||||
│ Showing 3 of 450 alerts │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Noise Reduction Analytics Dashboard
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Analytics/NoiseReductionDashboard.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/analytics/noise-reduction`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Analytics > Noise Reduction [Last 7 Days ▼] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ NOISE REDUCTION OVERVIEW │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
|
||||
│ │ Alert Triggers │ │ Alerts Created │ │ Noise Reduced │ │ Notifications │ │
|
||||
│ │ │ │ │ │ │ │ │ │
|
||||
│ │ 10,000 │ │ 3,500 │ │ 65% │ │ 2,000 │ │
|
||||
│ │ total │ │ created │ │ reduction │ │ sent │ │
|
||||
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
|
||||
│ │
|
||||
│ REDUCTION BREAKDOWN │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ │ │
|
||||
│ │ Total Alert Triggers: 10,000 │ │
|
||||
│ │ ├─────────────────────────────────────────────────────────────────────────────│ │
|
||||
│ │ │ Deduplicated │ Suppressed │ Grouped │ Created │ │
|
||||
│ │ │ 4,000 (40%) │ 2,500 (25%) │ 1,500 (15%)│ 3,500 (35%) │ │
|
||||
│ │ │ ████████████████████ │ ████████████ │ ███████ │ ██████████████ │ │
|
||||
│ │ └─────────────────────────────────────────────────────────────────────────────│ │
|
||||
│ │ │ │
|
||||
│ │ Legend: │ │
|
||||
│ │ ■ Deduplicated - Merged with existing alerts │ │
|
||||
│ │ ■ Suppressed - Blocked by suppression rules │ │
|
||||
│ │ ■ Grouped - Added to existing episodes (reduced notifications) │ │
|
||||
│ │ ■ Created - New unique alerts │ │
|
||||
│ │ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ DAILY TREND │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 2000 ─┬───────────────────────────────────────────────────────────────── │ │
|
||||
│ │ │ │ │
|
||||
│ │ 1500 ─┤ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ │ │
|
||||
│ │ │ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓ │ │
|
||||
│ │ 1000 ─┤ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓ │ │
|
||||
│ │ │ ▓▓▓▓░░ ▓▓▓▓ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
|
||||
│ │ 500 ─┤ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
|
||||
│ │ │ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ ▓▓▓▓░░ │ │
|
||||
│ │ 0 ─┴────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ Jan 14 Jan 15 Jan 16 Jan 17 Jan 18 Jan 19 Jan 20 │ │
|
||||
│ │ │ │
|
||||
│ │ ▓ Triggers ░ Created Line: Reduction % │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ TOP NOISE SOURCES │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ By Monitor By Severity │
|
||||
│ ┌────────────────────────────────────┐ ┌────────────────────────────────────────┐ │
|
||||
│ │ 1. mysql-prod 500 alerts │ │ 1. Warning 600 alerts │ │
|
||||
│ │ ████████████████████ (300 dupe)│ │ ████████████████████████ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ 2. api-gateway 350 alerts │ │ 2. High 400 alerts │ │
|
||||
│ │ ██████████████ (150 dupe) │ │ ████████████████ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ 3. redis-cluster 280 alerts │ │ 3. Critical 200 alerts │ │
|
||||
│ │ ███████████ (180 dupe) │ │ ████████ │ │
|
||||
│ └────────────────────────────────────┘ └────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Components to Create
|
||||
|
||||
### 1. StormStatusBanner
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/StormStatusBanner.tsx`
|
||||
|
||||
Global banner that appears when a storm is active.
|
||||
|
||||
```typescript
|
||||
/** Props for the global banner that appears when a storm is active. */
interface StormStatusBannerProps {
|
||||
// The storm to display; presumably null when no storm is active - confirm.
stormEvent: AlertStormEvent | null;
|
||||
// Optional handler for the banner's [Dismiss] action.
onDismiss?: () => void;
|
||||
// Optional handler for the banner's [View Details] action.
onViewDetails?: () => void;
|
||||
}
|
||||
```
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ⚠️ ALERT STORM DETECTED - 150 alerts/hour (5x normal) - 12 monitors affected │
|
||||
│ [View Details] [Dismiss] │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 2. StormSeverityBadge
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/StormSeverityBadge.tsx`
|
||||
|
||||
Badge showing storm severity level.
|
||||
|
||||
```typescript
|
||||
/** Props for the badge showing a storm's severity level. */
interface StormSeverityBadgeProps {
|
||||
// One of the four severity levels the badge can render.
severity: 'normal' | 'elevated' | 'storm' | 'critical';
|
||||
// Presumably toggles a text label next to the badge - confirm.
showLabel?: boolean;
|
||||
}
|
||||
```
|
||||
|
||||
### 3. AlertVolumeChart
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/AlertVolumeChart.tsx`
|
||||
|
||||
Line chart showing alert volume over time.
|
||||
|
||||
```typescript
|
||||
/** Props for the line chart showing alert volume over time. */
interface AlertVolumeChartProps {
|
||||
// One entry per plotted point.
data: Array<{
|
||||
timestamp: Date;
|
||||
alertCount: number;
|
||||
// Optional baseline value for this point.
baseline?: number;
|
||||
}>;
|
||||
// Presumably draws the baseline alongside the volume line - confirm.
showBaseline?: boolean;
|
||||
// Presumably marks the time ranges in which a storm was active - confirm.
highlightStormPeriods?: boolean;
|
||||
}
|
||||
```
|
||||
|
||||
### 4. NoiseReductionSummaryCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/NoiseReductionSummaryCard.tsx`
|
||||
|
||||
Card showing noise reduction summary statistics.
|
||||
|
||||
```typescript
|
||||
/** Props for the card showing noise reduction summary statistics. */
interface NoiseReductionSummaryCardProps {
|
||||
// Total number of alert triggers in the period.
totalTriggers: number;
|
||||
alertsCreated: number;
|
||||
deduplicated: number;
|
||||
suppressed: number;
|
||||
grouped: number;
|
||||
// Noise reduction as a percentage.
reductionPercent: number;
|
||||
}
|
||||
```
|
||||
|
||||
### 5. NoiseReductionBreakdownChart
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/NoiseReductionBreakdownChart.tsx`
|
||||
|
||||
Stacked bar or donut chart showing reduction breakdown.
|
||||
|
||||
```typescript
|
||||
/**
 * Props for the stacked bar or donut chart showing the reduction breakdown.
 * Each field is the count for one segment of the chart.
 */
interface NoiseReductionBreakdownChartProps {
|
||||
deduplicated: number;
|
||||
suppressed: number;
|
||||
grouped: number;
|
||||
created: number;
|
||||
}
|
||||
```
|
||||
|
||||
### 6. TopNoiseSourcesTable
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/TopNoiseSourcesTable.tsx`
|
||||
|
||||
Table showing top noise-generating monitors or services.
|
||||
|
||||
```typescript
|
||||
/** Props for the table showing top noise-generating monitors or services. */
interface TopNoiseSourcesTableProps {
|
||||
// One entry per table row.
sources: Array<{
|
||||
id: string;
|
||||
name: string;
|
||||
alertCount: number;
|
||||
duplicateCount: number;
|
||||
}>;
|
||||
// Presumably the dimension that each row in `sources` represents - confirm.
groupBy: 'monitor' | 'severity' | 'service';
|
||||
}
|
||||
```
|
||||
|
||||
### 7. DailyMetricsChart
|
||||
|
||||
**File:** `/Dashboard/src/Components/Analytics/DailyMetricsChart.tsx`
|
||||
|
||||
Bar chart showing daily noise reduction metrics.
|
||||
|
||||
```typescript
|
||||
/** Noise reduction metrics for a single day. */
interface DailyNoiseMetric {
  /** Day the metrics belong to, as a string (format not specified here). */
  date: string;
  /** Total alert triggers for the day. */
  totalTriggers: number;
  /** Alerts actually created for the day. */
  alertsCreated: number;
  /** Reduction for the day, expressed as a percentage. */
  reductionPercent: number;
}

/** Props for DailyMetricsChart, the bar chart of daily noise reduction metrics. */
interface DailyMetricsChartProps {
  /** One entry per day to plot. */
  data: Array<DailyNoiseMetric>;
}
|
||||
```
|
||||
|
||||
### 8. AffectedMonitorsTable
|
||||
|
||||
**File:** `/Dashboard/src/Components/Storm/AffectedMonitorsTable.tsx`
|
||||
|
||||
Table showing monitors contributing to a storm.
|
||||
|
||||
```typescript
|
||||
/** One monitor contributing to a storm. */
interface AffectedMonitor {
  /** Identifier of the monitor. */
  monitorId: string;
  /** Display name of the monitor. */
  monitorName: string;
  /** Alert count attributed to the monitor. */
  alertCount: number;
  /** Presumably the monitor's share of the storm's alerts (confirm the 0-100 vs 0-1 scale). */
  percentage: number;
}

/** Props for AffectedMonitorsTable, the table of monitors contributing to a storm. */
interface AffectedMonitorsTableProps {
  /** Rows to display. */
  monitors: Array<AffectedMonitor>;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Routing Configuration
|
||||
|
||||
Add to route configuration:
|
||||
|
||||
```typescript
|
||||
// Storm Detection Settings
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-storm-detection',
|
||||
component: AlertStormDetectionSettingsPage,
|
||||
}
|
||||
|
||||
// Storm History
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/storm-history',
|
||||
component: StormHistoryPage,
|
||||
}
|
||||
|
||||
// Storm Event Detail
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/storm-history/:stormEventId',
|
||||
component: StormEventDetailPage,
|
||||
}
|
||||
|
||||
// Noise Reduction Analytics
|
||||
{
|
||||
path: '/dashboard/:projectId/analytics/noise-reduction',
|
||||
component: NoiseReductionDashboardPage,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Global Integration
|
||||
|
||||
### Dashboard Header Integration
|
||||
|
||||
Add storm status indicator to main dashboard header:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ OneUptime [Projects ▼] Alerts Monitors Status Pages 🔴 Storm Active │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Alerts Page Integration
|
||||
|
||||
Add storm status banner above alerts list when storm is active:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ⚠️ Alert Storm Active - Click to view details │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [Normal alerts table continues below...] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pages
|
||||
- [ ] Storm Detection settings page
|
||||
- [ ] Storm History list page
|
||||
- [ ] Storm Event detail page
|
||||
- [ ] Noise Reduction Analytics dashboard
|
||||
|
||||
### Components
|
||||
- [ ] StormStatusBanner
|
||||
- [ ] StormSeverityBadge
|
||||
- [ ] AlertVolumeChart
|
||||
- [ ] NoiseReductionSummaryCard
|
||||
- [ ] NoiseReductionBreakdownChart
|
||||
- [ ] TopNoiseSourcesTable
|
||||
- [ ] DailyMetricsChart
|
||||
- [ ] AffectedMonitorsTable
|
||||
|
||||
### Global Integrations
|
||||
- [ ] Add storm indicator to dashboard header
|
||||
- [ ] Add storm banner to Alerts page
|
||||
- [ ] Add sidebar navigation items
|
||||
|
||||
### Styling
|
||||
- [ ] Storm severity color scheme (yellow/orange/red)
|
||||
- [ ] Chart styling for analytics
|
||||
- [ ] Banner animation styles
|
||||
- [ ] Responsive layouts
|
||||
|
||||
### Data Fetching
|
||||
- [ ] Storm status polling (every 30 seconds when on dashboard)
|
||||
- [ ] Storm events API integration
|
||||
- [ ] Noise reduction metrics API integration
|
||||
- [ ] WebSocket support for real-time storm updates (optional)
|
||||
159
Docs/Plan/AlertStormDetection/README.md
Normal file
159
Docs/Plan/AlertStormDetection/README.md
Normal file
@@ -0,0 +1,159 @@
|
||||
# Alert Storm Detection Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This sub-plan details the implementation of Alert Storm Detection and Analytics functionality for OneUptime. This feature detects when alert volume spikes abnormally and provides noise reduction analytics.
|
||||
|
||||
## Documents
|
||||
|
||||
| Document | Description |
|
||||
|----------|-------------|
|
||||
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
|
||||
| [2-Backend.md](./2-Backend.md) | Backend services and storm detector |
|
||||
| [3-API.md](./3-API.md) | REST API endpoints |
|
||||
| [4-UI.md](./4-UI.md) | Frontend components and pages |
|
||||
|
||||
## Feature Summary
|
||||
|
||||
### What is Alert Storm Detection?
|
||||
|
||||
Alert Storm Detection identifies when the rate of incoming alerts significantly exceeds normal patterns. This helps operators understand when something unusual is happening and optionally enables automatic suppression during storms.
|
||||
|
||||
### Storm Detection Logic
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Storm Detection Algorithm │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
Current Hour Historical Average Storm Check
|
||||
┌─────────────────┐ ┌─────────────────────┐ ┌─────────────┐
|
||||
│ │ │ │ │ │
|
||||
│ 150 alerts │ vs │ 30 alerts/hour │ = │ 5x normal │
|
||||
│ (this hour) │ │ (last 24h avg) │ │ = STORM! │
|
||||
│ │ │ │ │ │
|
||||
└─────────────────┘ └─────────────────────┘ └─────────────┘
|
||||
```
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
1. **Storm Detection** - Identify abnormal alert spikes
|
||||
2. **Historical Analysis** - Compare against baseline patterns
|
||||
3. **Storm Alerts** - Notify admins when storm detected
|
||||
4. **Emergency Suppression** - Optional auto-suppression during storms
|
||||
5. **Noise Reduction Analytics** - Track overall noise reduction metrics
|
||||
6. **Top Alerting Sources** - Identify which monitors/services cause most noise
|
||||
|
||||
### Storm Thresholds
|
||||
|
||||
| Level | Multiplier | Description |
|
||||
|-------|------------|-------------|
|
||||
| Normal | < 2x | Within normal variance |
|
||||
| Elevated | 2x to < 3x | Higher than usual |
|
||||
| Storm | 3x - 5x | Significant spike |
|
||||
| Critical Storm | > 5x | Major incident likely |
|
||||
|
||||
### User Stories
|
||||
|
||||
```
|
||||
As an SRE, I want to be notified when an alert storm starts
|
||||
so that I know something significant is happening.
|
||||
|
||||
As an operator, I want to see which monitors are causing the most alerts
|
||||
so that I can prioritize investigation.
|
||||
|
||||
As a team lead, I want to see noise reduction metrics
|
||||
so that I can measure the effectiveness of our alert tuning.
|
||||
|
||||
As an admin, I want to enable emergency suppression during storms
|
||||
so that my team isn't overwhelmed during major incidents.
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Storm Detection Core (Week 1)
|
||||
|
||||
- [ ] Create AlertStormEvent model
|
||||
- [ ] Implement StormDetector service
|
||||
- [ ] Create storm monitoring worker job
|
||||
- [ ] Add storm detection settings
|
||||
|
||||
### Phase 2: Storm Notifications (Week 2)
|
||||
|
||||
- [ ] Storm start/end notifications
|
||||
- [ ] Top alerting monitors identification
|
||||
- [ ] Storm event timeline
|
||||
- [ ] Admin notifications
|
||||
|
||||
### Phase 3: Noise Reduction Analytics (Week 3)
|
||||
|
||||
- [ ] Create NoiseReductionMetric model
|
||||
- [ ] Daily metrics calculation job
|
||||
- [ ] Deduplication statistics
|
||||
- [ ] Suppression statistics
|
||||
- [ ] Grouping statistics
|
||||
|
||||
### Phase 4: UI Dashboard (Week 4)
|
||||
|
||||
- [ ] Storm status banner
|
||||
- [ ] Noise reduction dashboard
|
||||
- [ ] Alert volume charts
|
||||
- [ ] Top alerting sources view
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Storm Detection Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌───────────────────┐
|
||||
│ Worker Job │
|
||||
│ (Every 5 min) │
|
||||
└─────────┬─────────┘
|
||||
│
|
||||
▼
|
||||
┌───────────────────────────────────────────────────────────────────────────┐
|
||||
│ StormDetector.checkStatus() │
|
||||
├───────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────────┐ │
|
||||
│ │ Get current │──▶│ Get historical │──▶│ Calculate multiplier │ │
|
||||
│ │ hour count │ │ average │ │ current / historical │ │
|
||||
│ └─────────────────┘ └─────────────────┘ └─────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌─────────┴─────────┐ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌─────────────────┐ ┌─────────────────┐│
|
||||
│ │ multiplier < 3 │ │ multiplier >= 3 ││
|
||||
│ │ = No storm │ │ = STORM ││
|
||||
│ └─────────────────┘ └────────┬────────┘│
|
||||
│ │ │
|
||||
└──────────────────────────────────────────────────────────────────┼─────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────┐
|
||||
│ Storm Actions: │
|
||||
│ - Create AlertStormEvent│
|
||||
│ - Notify admins │
|
||||
│ - Show banner │
|
||||
│ - Optional: auto-suppress│
|
||||
└─────────────────────────┘
|
||||
```
|
||||
|
||||
## Success Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Storm detection accuracy | > 95% |
|
||||
| Detection latency | < 5 minutes |
|
||||
| False positive rate | < 5% |
|
||||
| Noise reduction visibility | 100% of projects |
|
||||
|
||||
## References
|
||||
|
||||
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
|
||||
- [Alert Grouping Plan](../AlertGrouping/README.md)
|
||||
- [Alert Suppression Plan](../AlertSuppression/README.md)
|
||||
- [Alert Deduplication Plan](../AlertDeduplication/README.md)
|
||||
1117
Docs/Plan/AlertSuppression/1-DataModels.md
Normal file
1117
Docs/Plan/AlertSuppression/1-DataModels.md
Normal file
File diff suppressed because it is too large
Load Diff
909
Docs/Plan/AlertSuppression/2-Backend.md
Normal file
909
Docs/Plan/AlertSuppression/2-Backend.md
Normal file
@@ -0,0 +1,909 @@
|
||||
# Backend Implementation for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the backend services and components required for Alert Suppression functionality.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Suppression Evaluation Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌──────────────────────┐
|
||||
│ Alert Trigger │
|
||||
│ (Monitor/Manual) │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ SuppressionEngine.evaluate() │
|
||||
├──────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────┐ │
|
||||
│ │ 1. Get enabled │ │
|
||||
│ │ rules │ │
|
||||
│ └────────┬────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 2. For each rule (sorted by priority): │ │
|
||||
│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │
|
||||
│ │ │ Match Criteria│─▶│ Check if Rule │─▶│ Apply Action │ │ │
|
||||
│ │ │ Evaluation │ │ is Active │ │ │ │ │
|
||||
│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌───────────────┼───────────────┐ │ │
|
||||
│ │ ▼ ▼ ▼ │ │
|
||||
│ │ ┌─────────────────┐ ┌──────────────┐ ┌─────────────────┐ │ │
|
||||
│ │ │ Maintenance │ │ Condition │ │ Rate Limit │ │ │
|
||||
│ │ │ Window Active? │ │ Met? │ │ Exceeded? │ │ │
|
||||
│ │ └─────────────────┘ └──────────────┘ └─────────────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────┐ │
|
||||
│ │ 3. Determine │ │
|
||||
│ │ final action │ │
|
||||
│ └─────────────────┘ │
|
||||
│ │
|
||||
└──────────────────────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────┴──────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌────────┐ ┌────────┐
|
||||
│SUPPRESS│ │ ALLOW │
|
||||
└────────┘ └────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Services to Create
|
||||
|
||||
### 1. AlertSuppressionRuleService
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertSuppressionRuleService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import AlertSuppressionRule, {
|
||||
SuppressionRuleType,
|
||||
} from '../Models/DatabaseModels/AlertSuppressionRule';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import SortOrder from 'Common/Types/BaseDatabase/SortOrder';
|
||||
|
||||
/**
 * Database service for AlertSuppressionRule records.
 *
 * Besides the inherited CRUD operations it exposes helpers to load the enabled
 * rules of a project, to find the maintenance windows that are active right
 * now, and to bump a rule's suppression counter.
 *
 * TODO: `logger`, `QueryHelper` and `MaintenanceWindowConfig` are used below but
 * are not part of this file's import list; import them before using this code.
 */
export class Service extends DatabaseService<AlertSuppressionRule> {
  public constructor() {
    super(AlertSuppressionRule);
  }

  /**
   * Get all enabled rules for a project, sorted by ascending priority.
   *
   * @param projectId - Project whose rules are loaded.
   * @returns The enabled rules with the fields needed for evaluation selected.
   */
  public async getEnabledRulesForProject(
    projectId: ObjectID
  ): Promise<Array<AlertSuppressionRule>> {
    return await this.findBy({
      query: {
        projectId,
        isEnabled: true,
      },
      select: {
        _id: true,
        name: true,
        type: true,
        matchCriteria: true,
        maintenanceWindow: true,
        condition: true,
        rateLimit: true,
        action: true,
        suppressionGroupId: true,
        priority: true,
      },
      sort: { priority: SortOrder.Ascending },
      props: { isRoot: true },
    });
  }

  /**
   * Get the enabled maintenance-window rules of a project that are active now.
   *
   * @param projectId - Project whose rules are inspected.
   * @returns Rules of type MaintenanceWindow whose window currently applies.
   */
  public async getActiveMaintenanceWindows(
    projectId: ObjectID
  ): Promise<Array<AlertSuppressionRule>> {
    const rules = await this.getEnabledRulesForProject(projectId);

    return rules.filter((rule) => {
      if (rule.type !== SuppressionRuleType.MaintenanceWindow) {
        return false;
      }
      return this.isMaintenanceWindowActive(rule);
    });
  }

  /**
   * Check if a rule's maintenance window is currently active.
   *
   * A rule without a maintenance window configuration is never active.
   * Recurring windows are delegated to evaluateRecurrence(); one-off windows
   * are active while `now` lies within [startTime, endTime].
   */
  private isMaintenanceWindowActive(rule: AlertSuppressionRule): boolean {
    const maintenanceWindow = rule.maintenanceWindow;
    if (!maintenanceWindow) {
      return false;
    }

    const now = new Date();

    if (maintenanceWindow.isRecurring && maintenanceWindow.recurrenceRule) {
      return this.evaluateRecurrence(maintenanceWindow, now);
    }

    return now >= maintenanceWindow.startTime && now <= maintenanceWindow.endTime;
  }

  /**
   * Evaluate a recurring maintenance window (RRULE format) at a given instant.
   *
   * The window is active when `now` falls between the start of the most recent
   * occurrence and that start plus the window's duration
   * (endTime - startTime of the configured window).
   *
   * @param window - Maintenance window configuration with a recurrence rule.
   * @param now - Instant to evaluate.
   * @returns true when an occurrence of the window covers `now`; false when
   *          there is none, the rule is missing, or the rule cannot be parsed.
   */
  private evaluateRecurrence(
    window: MaintenanceWindowConfig,
    now: Date
  ): boolean {
    if (!window.recurrenceRule) {
      return false;
    }

    try {
      // Uses the rrule library for parsing.
      const RRule = require('rrule').RRule;

      // Anchor the recurrence to the window's start when the rule string has no
      // DTSTART; otherwise rrule anchors the series at the time of evaluation.
      const options = RRule.parseString(window.recurrenceRule);
      if (!options.dtstart) {
        options.dtstart = window.startTime;
      }
      const rule = new RRule(options);

      // Most recent occurrence that started at or before `now`. Looking for the
      // first occurrence after a fixed look-back would pick a stale occurrence
      // for rules recurring more often than the look-back, and would miss
      // windows that are longer than it.
      const occurrenceStart: Date | null = rule.before(now, true);

      if (!occurrenceStart) {
        return false;
      }

      // Every occurrence lasts as long as the originally configured window.
      const durationMs = window.endTime.getTime() - window.startTime.getTime();
      const occurrenceEnd = new Date(occurrenceStart.getTime() + durationMs);

      return now >= occurrenceStart && now <= occurrenceEnd;
    } catch (error) {
      logger.error('Error evaluating recurrence rule:', error);
      return false;
    }
  }

  /**
   * Increment the suppressed count of a rule and record when it last triggered.
   *
   * @param ruleId - Rule that suppressed an alert.
   */
  public async incrementSuppressedCount(ruleId: ObjectID): Promise<void> {
    await this.updateOneById({
      id: ruleId,
      data: {
        suppressedCount: QueryHelper.increment(1),
        lastTriggeredAt: new Date(),
      },
      props: { isRoot: true },
    });
  }
}
|
||||
|
||||
export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. SuppressionEngine
|
||||
|
||||
**File Location:** `/Common/Server/Utils/Alert/SuppressionEngine.ts`
|
||||
|
||||
```typescript
|
||||
import Alert from '../../Models/DatabaseModels/Alert';
|
||||
import AlertSuppressionRule, {
|
||||
SuppressionRuleType,
|
||||
SuppressionAction,
|
||||
SuppressionMatchCriteria,
|
||||
RateLimitConfig,
|
||||
} from '../../Models/DatabaseModels/AlertSuppressionRule';
|
||||
import AlertSuppressionRuleService from '../../Services/AlertSuppressionRuleService';
|
||||
import AlertThrottleStateService from '../../Services/AlertThrottleStateService';
|
||||
import SuppressedAlertLogService from '../../Services/SuppressedAlertLogService';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
import OneUptimeDate from 'Common/Types/Date';
|
||||
|
||||
/** Outcome of evaluating the suppression rules for one alert. */
export interface SuppressionResult {
  /** True when at least one active rule matched the alert. */
  shouldSuppress: boolean;

  /** Action to apply, or 'none' when nothing matched. */
  action: SuppressionAction | 'none';

  /** Active rules that matched the alert, in evaluation order. */
  matchedRules: Array<AlertSuppressionRule>;

  /** Human-readable explanation of the suppression, when available. */
  reason?: string;
}
|
||||
|
||||
/**
 * Decides whether an alert should be suppressed.
 *
 * evaluate() loads the enabled suppression rules of a project, checks each one
 * against the alert (match criteria first, then whether the rule is currently
 * active) and combines the matching rules into a single SuppressionResult.
 */
export default class SuppressionEngine {
  /**
   * Evaluate all suppression rules for an alert.
   *
   * Rules are evaluated in the order returned by
   * AlertSuppressionRuleService.getEnabledRulesForProject (priority order).
   * Evaluation stops at the first rule that suppresses creation
   * (SuppressCreation or Both); otherwise the result is SuppressNotifications
   * when at least one rule matched. The rule that decided the final action is
   * the one written to the audit log, together with its own reason.
   *
   * @param alertData - Alert fields available before the alert is created.
   * @param projectId - Project the alert belongs to.
   * @returns The suppression decision, the matched rules and the reason.
   */
  public static async evaluate(
    alertData: Partial<Alert>,
    projectId: ObjectID
  ): Promise<SuppressionResult> {
    // Get all enabled suppression rules
    const rules = await AlertSuppressionRuleService.getEnabledRulesForProject(
      projectId
    );

    if (rules.length === 0) {
      return {
        shouldSuppress: false,
        action: 'none',
        matchedRules: [],
      };
    }

    const matchedRules: Array<AlertSuppressionRule> = [];
    let finalAction: SuppressionAction | 'none' = 'none';
    // Rule whose action determined finalAction; used for the reason and the log
    // entry so that rule id, reason and action always belong together.
    let decidingRule: AlertSuppressionRule | undefined;

    // Evaluate each rule in priority order
    for (const rule of rules) {
      if (!(await this.matchesCriteria(alertData, rule.matchCriteria))) {
        continue;
      }

      // NOTE: for rate-limit rules this check also updates the throttle state.
      if (!(await this.isRuleActive(rule, alertData, projectId))) {
        continue;
      }

      matchedRules.push(rule);

      // Most restrictive action wins: a rule that suppresses creation ends the
      // evaluation, no later rule could be more restrictive.
      if (
        rule.action === SuppressionAction.Both ||
        rule.action === SuppressionAction.SuppressCreation
      ) {
        finalAction = rule.action;
        decidingRule = rule;
        break;
      }

      // Any other matching rule only suppresses notifications; the first one
      // stays the deciding rule unless a later rule escalates.
      if (!decidingRule) {
        finalAction = SuppressionAction.SuppressNotifications;
        decidingRule = rule;
      }
    }

    const reason = decidingRule ? this.buildSuppressionReason(decidingRule) : '';

    // Log suppression if applicable
    if (decidingRule) {
      await this.logSuppression(
        alertData,
        decidingRule,
        projectId,
        reason,
        finalAction
      );
    }

    return {
      shouldSuppress: matchedRules.length > 0,
      action: finalAction,
      matchedRules,
      reason,
    };
  }

  /**
   * Check if an alert matches a rule's criteria.
   *
   * Missing criteria or `matchAll` match every alert. Otherwise every
   * configured criterion (severity, monitor, labels, title pattern,
   * description pattern) must match; for labels one shared label is enough.
   */
  private static async matchesCriteria(
    alertData: Partial<Alert>,
    criteria?: SuppressionMatchCriteria
  ): Promise<boolean> {
    if (!criteria || criteria.matchAll) {
      return true;
    }

    // Check severity
    if (criteria.severityIds?.length) {
      const alertSeverityId = alertData.alertSeverityId?.toString();
      if (!alertSeverityId || !criteria.severityIds.includes(alertSeverityId)) {
        return false;
      }
    }

    // Check monitors
    if (criteria.monitorIds?.length) {
      const alertMonitorId = alertData.monitorId?.toString();
      if (!alertMonitorId || !criteria.monitorIds.includes(alertMonitorId)) {
        return false;
      }
    }

    // Check labels
    if (criteria.labelIds?.length) {
      const alertLabelIds = (alertData.labels || []).map(
        (l) => l.id?.toString() || l._id?.toString()
      );
      const hasMatchingLabel = criteria.labelIds.some((id) =>
        alertLabelIds.includes(id)
      );
      if (!hasMatchingLabel) {
        return false;
      }
    }

    // Check title pattern
    if (
      criteria.titlePattern &&
      !this.matchesPattern(criteria.titlePattern, alertData.title || '')
    ) {
      return false;
    }

    // Check description pattern
    if (
      criteria.descriptionPattern &&
      !this.matchesPattern(
        criteria.descriptionPattern,
        alertData.description || ''
      )
    ) {
      return false;
    }

    return true;
  }

  /**
   * Test a text against a case-insensitive regular expression pattern.
   *
   * An invalid pattern is logged and treated as "no match": skipping the check
   * would let a rule with a broken pattern suppress every alert.
   */
  private static matchesPattern(pattern: string, text: string): boolean {
    try {
      return new RegExp(pattern, 'i').test(text);
    } catch (error) {
      logger.error('Invalid suppression match pattern:', error);
      return false;
    }
  }

  /**
   * Check if a rule is currently active, depending on its type.
   * Unknown rule types are never active.
   */
  private static async isRuleActive(
    rule: AlertSuppressionRule,
    alertData: Partial<Alert>,
    projectId: ObjectID
  ): Promise<boolean> {
    switch (rule.type) {
      case SuppressionRuleType.MaintenanceWindow:
        return this.isMaintenanceWindowActive(rule);

      case SuppressionRuleType.ConditionBased:
        return await this.isConditionMet(rule, projectId);

      case SuppressionRuleType.RateLimit:
        return await this.isRateLimitExceeded(rule, alertData, projectId);

      default:
        return false;
    }
  }

  /**
   * Check if a rule's maintenance window is active.
   *
   * A rule without a maintenance window is never active. Recurring windows are
   * delegated to evaluateRecurrence(); one-off windows are active while `now`
   * lies within [startTime, endTime].
   */
  private static isMaintenanceWindowActive(rule: AlertSuppressionRule): boolean {
    const maintenanceWindow = rule.maintenanceWindow;
    if (!maintenanceWindow) {
      return false;
    }

    const now = new Date();

    if (maintenanceWindow.isRecurring && maintenanceWindow.recurrenceRule) {
      return this.evaluateRecurrence(maintenanceWindow, now);
    }

    return now >= maintenanceWindow.startTime && now <= maintenanceWindow.endTime;
  }

  /**
   * Evaluate a recurring maintenance window (RRULE format) at a given instant.
   *
   * The window is active when `now` falls between the start of the most recent
   * occurrence and that start plus the duration of the configured window.
   *
   * @returns true when an occurrence covers `now`; false when there is none,
   *          the rule is missing, or the rule cannot be parsed.
   */
  private static evaluateRecurrence(
    window: MaintenanceWindowConfig,
    now: Date
  ): boolean {
    if (!window.recurrenceRule) {
      return false;
    }

    try {
      const RRule = require('rrule').RRule;

      // Anchor the recurrence to the window's start when the rule string has no
      // DTSTART; otherwise rrule anchors the series at the time of evaluation.
      const options = RRule.parseString(window.recurrenceRule);
      if (!options.dtstart) {
        options.dtstart = window.startTime;
      }
      const rule = new RRule(options);

      // Most recent occurrence that started at or before `now`. A fixed
      // look-back combined with "first occurrence after" picks a stale
      // occurrence for frequently recurring rules and misses long windows.
      const occurrenceStart: Date | null = rule.before(now, true);

      if (!occurrenceStart) {
        return false;
      }

      const durationMs = window.endTime.getTime() - window.startTime.getTime();
      const occurrenceEnd = new Date(occurrenceStart.getTime() + durationMs);

      return now >= occurrenceStart && now <= occurrenceEnd;
    } catch (error) {
      logger.error('Error evaluating recurrence rule:', error);
      return false;
    }
  }

  /**
   * Check if the condition of a condition-based rule is met.
   *
   * A rule without a condition is treated as inactive, in line with
   * maintenance-window and rate-limit rules that lack their configuration;
   * treating it as met would suppress every alert matching the rule.
   */
  private static async isConditionMet(
    rule: AlertSuppressionRule,
    projectId: ObjectID
  ): Promise<boolean> {
    const condition = rule.condition;
    if (!condition) {
      return false;
    }

    // Check if another alert is active (not resolved) with specific labels
    if (condition.whenAlertActiveWithLabelIds?.length) {
      const resolvedStateId = await AlertStateService.getResolvedStateId(
        projectId
      );

      const activeAlert = await AlertService.findOneBy({
        query: {
          projectId,
          labels: QueryHelper.any(condition.whenAlertActiveWithLabelIds),
          currentAlertStateId: QueryHelper.notEquals(resolvedStateId),
        },
        select: { _id: true },
        props: { isRoot: true },
      });

      if (activeAlert) {
        return true;
      }
    }

    // Check if monitor is in specific state
    if (condition.whenMonitorInStateIds?.length) {
      // TODO: not implemented yet; depends on monitor state tracking.
    }

    return false;
  }

  /**
   * Check if the rate limit of a rule is exceeded, updating the throttle state.
   *
   * The first alert of a window creates the throttle state and is never
   * throttled. Later alerts increment the counter and are throttled once the
   * counter exceeds `maxAlerts`.
   *
   * NOTE(review): the counter is read and written in separate calls, so
   * concurrent evaluations for the same key can lose increments.
   */
  private static async isRateLimitExceeded(
    rule: AlertSuppressionRule,
    alertData: Partial<Alert>,
    projectId: ObjectID
  ): Promise<boolean> {
    const rateLimit = rule.rateLimit;
    if (!rateLimit) {
      return false;
    }

    // Compute throttle key
    const throttleKey = this.computeThrottleKey(rule, alertData);

    // Look up the throttle state of the window that is still open, if any
    const state = await AlertThrottleStateService.findOneBy({
      query: {
        throttleKey,
        suppressionRuleId: rule.id!,
        windowExpiresAt: QueryHelper.greaterThan(new Date()),
      },
      select: {
        _id: true,
        alertCount: true,
        isThrottling: true,
      },
      props: { isRoot: true },
    });

    if (!state) {
      // No open window: start a new one with this alert as its first entry
      const now = new Date();
      const windowExpires = OneUptimeDate.addRemoveMinutes(
        now,
        rateLimit.timeWindowMinutes
      );

      await AlertThrottleStateService.create({
        data: {
          projectId,
          throttleKey,
          suppressionRuleId: rule.id!,
          alertCount: 1,
          firstAlertAt: now,
          lastAlertAt: now,
          windowExpiresAt: windowExpires,
          isThrottling: false,
        } as AlertThrottleState,
        props: { isRoot: true },
      });

      return false;
    }

    // Update throttle state
    const newCount = (state.alertCount || 0) + 1;
    const shouldThrottle = newCount > rateLimit.maxAlerts;

    await AlertThrottleStateService.updateOneById({
      id: state.id!,
      data: {
        alertCount: newCount,
        lastAlertAt: new Date(),
        isThrottling: shouldThrottle,
      },
      props: { isRoot: true },
    });

    return shouldThrottle;
  }

  /**
   * Compute the throttle key from the rule and the alert data.
   *
   * The key always starts with the rule id and gets one part per supported
   * `groupByFields` entry; unsupported field names are ignored.
   */
  private static computeThrottleKey(
    rule: AlertSuppressionRule,
    alertData: Partial<Alert>
  ): string {
    const parts: Array<string> = [`rule:${rule.id?.toString()}`];

    const groupByFields = rule.rateLimit?.groupByFields || [];

    for (const field of groupByFields) {
      switch (field) {
        case 'monitorId':
          parts.push(`monitor:${alertData.monitorId?.toString() || 'null'}`);
          break;
        case 'alertSeverityId':
        case 'severity':
          parts.push(
            `severity:${alertData.alertSeverityId?.toString() || 'null'}`
          );
          break;
        case 'title':
          parts.push(`title:${alertData.title || 'null'}`);
          break;
      }
    }

    return parts.join('|');
  }

  /**
   * Build the human-readable suppression reason for a rule.
   */
  private static buildSuppressionReason(rule: AlertSuppressionRule): string {
    switch (rule.type) {
      case SuppressionRuleType.MaintenanceWindow:
        return `Suppressed by maintenance window: ${rule.name}`;
      case SuppressionRuleType.ConditionBased:
        return `Suppressed by condition: ${rule.name}`;
      case SuppressionRuleType.RateLimit:
        return `Suppressed by rate limit: ${rule.name} (max ${rule.rateLimit?.maxAlerts} per ${rule.rateLimit?.timeWindowMinutes} min)`;
      default:
        return `Suppressed by rule: ${rule.name}`;
    }
  }

  /**
   * Log a suppressed alert for the audit trail and bump the rule's counter.
   */
  private static async logSuppression(
    alertData: Partial<Alert>,
    rule: AlertSuppressionRule,
    projectId: ObjectID,
    reason: string,
    action: SuppressionAction | 'none'
  ): Promise<void> {
    await SuppressedAlertLogService.create({
      data: {
        projectId,
        suppressionRuleId: rule.id,
        alertData: alertData as object,
        alertTitle: alertData.title,
        suppressionReason: reason,
        action: action as SuppressionAction,
        suppressedAt: new Date(),
        monitorId: alertData.monitorId,
      } as SuppressedAlertLog,
      props: { isRoot: true },
    });

    // Increment rule counter
    await AlertSuppressionRuleService.incrementSuppressedCount(rule.id!);
  }
}
|
||||
|
||||
// Import services at end to avoid circular dependencies
|
||||
import AlertService from '../../Services/AlertService';
|
||||
import AlertStateService from '../../Services/AlertStateService';
|
||||
import AlertThrottleState from '../../Models/DatabaseModels/AlertThrottleState';
|
||||
import SuppressedAlertLog from '../../Models/DatabaseModels/SuppressedAlertLog';
|
||||
import QueryHelper from '../../Types/Database/QueryHelper';
|
||||
import logger from '../../Utils/Logger';
|
||||
import { MaintenanceWindowConfig } from '../../Models/DatabaseModels/AlertSuppressionRule';
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Integration with AlertService
|
||||
|
||||
Modify `/Common/Server/Services/AlertService.ts`:
|
||||
|
||||
```typescript
|
||||
// Add import
|
||||
import SuppressionEngine from '../Utils/Alert/SuppressionEngine';
|
||||
|
||||
// In onBeforeCreate() method, add suppression check:
|
||||
/**
 * Hook that runs before an alert is created.
 *
 * The added section evaluates the project's suppression rules. When the
 * resulting action suppresses creation, a SuppressedAlertException is thrown
 * and the alert is not created; otherwise the alert is created with its
 * notifications marked as suppressed.
 */
protected async onBeforeCreate(
  createBy: CreateBy<Alert>
): Promise<OnCreate<Alert>> {
  // ... existing code ...

  // Check suppression rules
  const { shouldSuppress, action, reason, matchedRules } =
    await SuppressionEngine.evaluate(createBy.data, createBy.data.projectId!);

  if (shouldSuppress) {
    const blocksCreation =
      action === SuppressionAction.SuppressCreation ||
      action === SuppressionAction.Both;

    if (blocksCreation) {
      // Prevent alert creation
      throw new SuppressedAlertException(reason || 'Alert suppressed by rule');
    }

    // Mark for notification suppression
    createBy.data.notificationsSuppressed = true;
    createBy.data.suppressedByRuleId = matchedRules[0]?.id;
  }

  // ... rest of existing code ...
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. SuppressedAlertLogService
|
||||
|
||||
**File Location:** `/Common/Server/Services/SuppressedAlertLogService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import SuppressedAlertLog from '../Models/DatabaseModels/SuppressedAlertLog';
|
||||
import ObjectID from 'Common/Types/ObjectID';
|
||||
|
||||
/**
 * Database service for SuppressedAlertLog records (the audit trail of
 * suppressed alerts), with helpers to list entries per rule and to compute
 * suppression statistics for a project.
 *
 * TODO: `SortOrder` and `QueryHelper` are used below but are not part of this
 * file's import list; import them before using this code.
 */
export class Service extends DatabaseService<SuppressedAlertLog> {
  public constructor() {
    super(SuppressedAlertLog);
  }

  /**
   * Get suppressed alerts for a rule, most recent first.
   *
   * @param ruleId - Suppression rule whose log entries are returned.
   * @param limit - Maximum number of entries to return (default 100).
   */
  public async getSuppressedByRule(
    ruleId: ObjectID,
    limit: number = 100
  ): Promise<Array<SuppressedAlertLog>> {
    const logEntries = await this.findBy({
      query: { suppressionRuleId: ruleId },
      select: {
        _id: true,
        alertTitle: true,
        suppressionReason: true,
        action: true,
        suppressedAt: true,
        monitorId: true,
      },
      sort: { suppressedAt: SortOrder.Descending },
      limit,
      props: { isRoot: true },
    });

    return logEntries;
  }

  /**
   * Get suppression statistics for a project within a date range.
   *
   * @param projectId - Project whose log entries are counted.
   * @param startDate - Start of the range.
   * @param endDate - End of the range.
   * @returns The total number of suppressed alerts plus counts grouped by
   *          suppression rule and by action.
   */
  public async getStatistics(
    projectId: ObjectID,
    startDate: Date,
    endDate: Date
  ): Promise<{
    totalSuppressed: number;
    byRule: Array<{ ruleId: string; count: number }>;
    byAction: Array<{ action: string; count: number }>;
  }> {
    const totalSuppressed = await this.countBy({
      query: {
        projectId,
        suppressedAt: QueryHelper.between(startDate, endDate),
      },
      props: { isRoot: true },
    });

    // Both aggregations share the same filter and differ only in the field
    // they group by, so the pipeline is built from that field.
    const countGroupedBy = (groupField: string) => {
      return [
        {
          $match: {
            projectId: projectId.toString(),
            suppressedAt: { $gte: startDate, $lte: endDate },
          },
        },
        {
          $group: {
            _id: groupField,
            count: { $sum: 1 },
          },
        },
      ];
    };

    // Aggregate by rule
    const ruleBuckets = await this.aggregate({
      pipeline: countGroupedBy('$suppressionRuleId'),
    });

    // Aggregate by action
    const actionBuckets = await this.aggregate({
      pipeline: countGroupedBy('$action'),
    });

    const byRule = ruleBuckets.map((r) => ({ ruleId: r._id, count: r.count }));
    const byAction = actionBuckets.map((a) => ({
      action: a._id,
      count: a.count,
    }));

    return { totalSuppressed, byRule, byAction };
  }
}
|
||||
|
||||
export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. AlertThrottleStateService
|
||||
|
||||
**File Location:** `/Common/Server/Services/AlertThrottleStateService.ts`
|
||||
|
||||
```typescript
|
||||
import DatabaseService from './DatabaseService';
|
||||
import AlertThrottleState from '../Models/DatabaseModels/AlertThrottleState';
|
||||
|
||||
/**
 * Database service for `AlertThrottleState` rows.
 *
 * NOTE(review): `QueryHelper` is used below but is not in the import list
 * shown above the class in this snippet.
 */
export class Service extends DatabaseService<AlertThrottleState> {
  public constructor() {
    super(AlertThrottleState);
  }

  /**
   * Delete every throttle state whose `windowExpiresAt` is earlier than the
   * current time.
   *
   * @returns Whatever `deleteBy` resolves to (declared as a number here).
   */
  public async cleanupExpired(): Promise<number> {
    const cutoff = new Date();

    return await this.deleteBy({
      query: { windowExpiresAt: QueryHelper.lessThan(cutoff) },
      props: { isRoot: true },
    });
  }
}

export default new Service();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Worker Jobs
|
||||
|
||||
### 1. ThrottleStateCleanup Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertSuppression/ThrottleStateCleanup.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
|
||||
import { EVERY_HOUR } from 'Common/Utils/CronTime';
|
||||
import AlertThrottleStateService from 'Common/Server/Services/AlertThrottleStateService';
|
||||
|
||||
// Hourly job: purge expired alert throttle states and log how many were removed.
// NOTE(review): `logger` is used below but is not imported in this snippet.
RunCron(
  'AlertSuppression:ThrottleStateCleanup',
  { schedule: EVERY_HOUR, runOnStartup: false },
  async () => {
    const removed = await AlertThrottleStateService.cleanupExpired();

    // Stay quiet when nothing was deleted.
    if (!(removed > 0)) {
      return;
    }

    logger.info(`Cleaned up ${removed} expired throttle states`);
  }
);
|
||||
```
|
||||
|
||||
### 2. MaintenanceWindowNotification Job
|
||||
|
||||
**File Location:** `/Worker/Jobs/AlertSuppression/MaintenanceWindowNotification.ts`
|
||||
|
||||
```typescript
|
||||
import RunCron from '../../Utils/Cron';
|
||||
import { EVERY_MINUTE } from 'Common/Utils/CronTime';
|
||||
import AlertSuppressionRuleService from 'Common/Server/Services/AlertSuppressionRuleService';
|
||||
import { SuppressionRuleType } from 'Common/Models/DatabaseModels/AlertSuppressionRule';
|
||||
|
||||
// Every-minute job: announce maintenance windows 15 minutes before they start.
// NOTE(review): `NotificationService` is used below but is not imported in
// this snippet.
RunCron(
  'AlertSuppression:MaintenanceWindowNotification',
  { schedule: EVERY_MINUTE, runOnStartup: false },
  async () => {
    const leadTimeMs = 15 * 60 * 1000; // how far ahead of the start we notify
    const cronIntervalMs = 60 * 1000; // must match the EVERY_MINUTE schedule

    // Load every enabled maintenance-window rule; the start-time check is done
    // in memory below because the window lives in a nested column.
    const maintenanceRules = await AlertSuppressionRuleService.findBy({
      query: {
        type: SuppressionRuleType.MaintenanceWindow,
        isEnabled: true,
      },
      select: {
        _id: true,
        projectId: true,
        name: true,
        maintenanceWindow: true,
      },
      props: { isRoot: true },
    });

    // Only notify for windows whose start falls inside the one-interval slice
    // (now + 14 min, now + 15 min]. Checking the whole "next 15 minutes" on an
    // every-minute schedule would re-send the same notification on each run
    // until the window starts. Trade-off: a skipped or late run can miss a
    // window; persist a "notified" marker if that is not acceptable.
    const sliceEndMs = Date.now() + leadTimeMs;
    const sliceStartMs = sliceEndMs - cronIntervalMs;

    for (const rule of maintenanceRules) {
      const maintenanceWindow = rule.maintenanceWindow;
      if (!maintenanceWindow || !rule.projectId || !rule.name) {
        continue;
      }

      // Coerce through Date so a serialized (string) start time compares
      // correctly as well as a real Date.
      const startsAtMs = new Date(maintenanceWindow.startTime).getTime();
      if (Number.isNaN(startsAtMs)) {
        continue;
      }

      if (startsAtMs <= sliceStartMs || startsAtMs > sliceEndMs) {
        continue;
      }

      // NOTE(review): only `startTime` is inspected, so later occurrences of
      // a recurring window are not announced here — confirm whether the
      // recurrence rule should be expanded.
      await NotificationService.sendMaintenanceWindowNotification({
        projectId: rule.projectId,
        ruleName: rule.name,
        startsAt: maintenanceWindow.startTime,
        endsAt: maintenanceWindow.endTime,
      });
    }
  }
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Phase 1: Core Services
|
||||
- [ ] Create AlertSuppressionRuleService
|
||||
- [ ] Create AlertSuppressionGroupService
|
||||
- [ ] Create SuppressedAlertLogService
|
||||
- [ ] Create AlertThrottleStateService
|
||||
- [ ] Create SuppressionEngine
|
||||
|
||||
### Phase 2: Integration
|
||||
- [ ] Modify AlertService.onBeforeCreate()
|
||||
- [ ] Add SuppressedAlertException
|
||||
- [ ] Add notification suppression field to Alert
|
||||
|
||||
### Phase 3: Worker Jobs
|
||||
- [ ] Create ThrottleStateCleanup job
|
||||
- [ ] Create MaintenanceWindowNotification job
|
||||
- [ ] Register jobs in worker startup
|
||||
|
||||
### Phase 4: Testing
|
||||
- [ ] Unit tests for SuppressionEngine
|
||||
- [ ] Unit tests for criteria matching
|
||||
- [ ] Unit tests for rate limiting
|
||||
- [ ] Integration tests for full suppression flow
|
||||
499
Docs/Plan/AlertSuppression/3-API.md
Normal file
499
Docs/Plan/AlertSuppression/3-API.md
Normal file
@@ -0,0 +1,499 @@
|
||||
# API Design for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the REST API endpoints for Alert Suppression functionality.
|
||||
|
||||
## Base URLs
|
||||
|
||||
```
|
||||
/api/project/{projectId}/alert-suppression-rule
|
||||
/api/project/{projectId}/alert-suppression-group
|
||||
/api/project/{projectId}/suppressed-alert-log
|
||||
/api/project/{projectId}/maintenance-window
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Suppression Rules API
|
||||
|
||||
### List Suppression Rules
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-rule
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `type` | string | Filter by rule type (maintenance_window, condition_based, rate_limit) |
|
||||
| `isEnabled` | boolean | Filter by enabled status |
|
||||
| `suppressionGroupId` | ObjectID | Filter by suppression group |
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "rule-id-1",
|
||||
"name": "Nightly Maintenance Window",
|
||||
"description": "Suppress alerts during nightly deployments",
|
||||
"type": "maintenance_window",
|
||||
"isEnabled": true,
|
||||
"priority": 1,
|
||||
"matchCriteria": {
|
||||
"matchAll": true
|
||||
},
|
||||
"maintenanceWindow": {
|
||||
"startTime": "2026-01-20T02:00:00Z",
|
||||
"endTime": "2026-01-20T04:00:00Z",
|
||||
"timezone": "America/Los_Angeles",
|
||||
"isRecurring": true,
|
||||
"recurrenceRule": "FREQ=DAILY"
|
||||
},
|
||||
"action": "both",
|
||||
"suppressedCount": 156,
|
||||
"lastTriggeredAt": "2026-01-20T02:15:00Z"
|
||||
},
|
||||
{
|
||||
"_id": "rule-id-2",
|
||||
"name": "Rate Limit - 10/hour per monitor",
|
||||
"type": "rate_limit",
|
||||
"isEnabled": true,
|
||||
"priority": 2,
|
||||
"matchCriteria": {},
|
||||
"rateLimit": {
|
||||
"maxAlerts": 10,
|
||||
"timeWindowMinutes": 60,
|
||||
"groupByFields": ["monitorId"]
|
||||
},
|
||||
"action": "suppress_creation",
|
||||
"suppressedCount": 523
|
||||
}
|
||||
],
|
||||
"count": 5,
|
||||
"skip": 0,
|
||||
"limit": 10
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Get Suppression Rule
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Create Suppression Rule
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-rule
|
||||
```
|
||||
|
||||
**Request Body (Maintenance Window):**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Weekend Maintenance",
|
||||
"description": "Suppress alerts during weekend maintenance",
|
||||
"type": "maintenance_window",
|
||||
"isEnabled": true,
|
||||
"priority": 1,
|
||||
"matchCriteria": {
|
||||
"labelIds": ["production-label-id"]
|
||||
},
|
||||
"maintenanceWindow": {
|
||||
"startTime": "2026-01-25T00:00:00Z",
|
||||
"endTime": "2026-01-25T06:00:00Z",
|
||||
"timezone": "America/New_York",
|
||||
"isRecurring": true,
|
||||
"recurrenceRule": "FREQ=WEEKLY;BYDAY=SA,SU"
|
||||
},
|
||||
"action": "both"
|
||||
}
|
||||
```
|
||||
|
||||
**Request Body (Rate Limit):**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Alert Storm Protection",
|
||||
"description": "Limit alerts to 20 per hour per monitor",
|
||||
"type": "rate_limit",
|
||||
"isEnabled": true,
|
||||
"priority": 10,
|
||||
"matchCriteria": {
|
||||
"severityIds": ["warning-id", "info-id"]
|
||||
},
|
||||
"rateLimit": {
|
||||
"maxAlerts": 20,
|
||||
"timeWindowMinutes": 60,
|
||||
"groupByFields": ["monitorId"]
|
||||
},
|
||||
"action": "suppress_creation"
|
||||
}
|
||||
```
|
||||
|
||||
**Request Body (Condition-Based):**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Suppress Staging Alerts",
|
||||
"description": "Suppress notifications for staging environment",
|
||||
"type": "condition_based",
|
||||
"isEnabled": true,
|
||||
"priority": 5,
|
||||
"matchCriteria": {
|
||||
"labelIds": ["staging-label-id"]
|
||||
},
|
||||
"condition": {},
|
||||
"action": "suppress_notifications"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Update Suppression Rule
|
||||
|
||||
```http
|
||||
PUT /api/project/{projectId}/alert-suppression-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Delete Suppression Rule
|
||||
|
||||
```http
|
||||
DELETE /api/project/{projectId}/alert-suppression-rule/{ruleId}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Enable/Disable Rule
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/enable
|
||||
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/disable
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Test Suppression Rule
|
||||
|
||||
Test which alerts would be suppressed by a rule.
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-rule/{ruleId}/test
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"alertIds": ["alert-id-1", "alert-id-2", "alert-id-3"]
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"alertId": "alert-id-1",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"wouldSuppress": true,
|
||||
"action": "both",
|
||||
"reason": "Matches criteria and maintenance window is active"
|
||||
},
|
||||
{
|
||||
"alertId": "alert-id-2",
|
||||
"alertTitle": "API latency high",
|
||||
"wouldSuppress": false,
|
||||
"reason": "Does not match severity criteria"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Maintenance Windows API
|
||||
|
||||
Convenience endpoints specific to maintenance windows.
|
||||
|
||||
### List Active Maintenance Windows
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/maintenance-window/active
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "rule-id-1",
|
||||
"name": "Nightly Maintenance",
|
||||
"startedAt": "2026-01-20T02:00:00Z",
|
||||
"endsAt": "2026-01-20T04:00:00Z",
|
||||
"remainingMinutes": 45,
|
||||
"matchCriteria": { "matchAll": true }
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### List Upcoming Maintenance Windows
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/maintenance-window/upcoming
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `hours` | number | Look ahead hours (default: 24) |
|
||||
|
||||
---
|
||||
|
||||
### Quick Create Maintenance Window
|
||||
|
||||
Simplified endpoint for creating one-time maintenance windows.
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/maintenance-window/quick
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Emergency Deployment",
|
||||
"durationMinutes": 60,
|
||||
"matchCriteria": {
|
||||
"monitorIds": ["monitor-1", "monitor-2"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Creates a maintenance window starting immediately.
|
||||
|
||||
---
|
||||
|
||||
## Suppression Groups API
|
||||
|
||||
### List Suppression Groups
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-group
|
||||
```
|
||||
|
||||
### Create Suppression Group
|
||||
|
||||
```http
|
||||
POST /api/project/{projectId}/alert-suppression-group
|
||||
```
|
||||
|
||||
**Request Body:**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "Database Alerts",
|
||||
"description": "Group for database-related suppression rules",
|
||||
"throttleMinutes": 30
|
||||
}
|
||||
```
|
||||
|
||||
### Get Group with Rules
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/alert-suppression-group/{groupId}/rules
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Suppressed Alert Log API
|
||||
|
||||
### List Suppressed Alerts
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/suppressed-alert-log
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `suppressionRuleId` | ObjectID | Filter by rule |
|
||||
| `monitorId` | ObjectID | Filter by monitor |
|
||||
| `action` | string | Filter by action |
|
||||
| `suppressedAt` | DateRange | Filter by date |
|
||||
| `limit` | number | Results per page |
|
||||
| `skip` | number | Pagination offset |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"_id": "log-id-1",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"suppressionRule": {
|
||||
"_id": "rule-id",
|
||||
"name": "Nightly Maintenance"
|
||||
},
|
||||
"suppressionReason": "Suppressed by maintenance window: Nightly Maintenance",
|
||||
"action": "both",
|
||||
"suppressedAt": "2026-01-20T02:15:00Z",
|
||||
"monitor": {
|
||||
"_id": "monitor-id",
|
||||
"name": "MySQL Production"
|
||||
},
|
||||
"alertData": {
|
||||
"title": "MySQL connection timeout",
|
||||
"description": "Connection to MySQL timed out after 30s",
|
||||
"severity": "High"
|
||||
}
|
||||
}
|
||||
],
|
||||
"count": 156,
|
||||
"skip": 0,
|
||||
"limit": 10
|
||||
}
|
||||
```
|
||||
|
||||
### Get Suppression Statistics
|
||||
|
||||
```http
|
||||
GET /api/project/{projectId}/suppressed-alert-log/statistics
|
||||
```
|
||||
|
||||
**Query Parameters:**
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `startDate` | Date | Start of period |
|
||||
| `endDate` | Date | End of period |
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"period": {
|
||||
"startDate": "2026-01-13T00:00:00Z",
|
||||
"endDate": "2026-01-20T00:00:00Z"
|
||||
},
|
||||
"totalSuppressed": 1234,
|
||||
"byRule": [
|
||||
{ "ruleId": "rule-1", "ruleName": "Nightly Maintenance", "count": 523 },
|
||||
{ "ruleId": "rule-2", "ruleName": "Rate Limit", "count": 711 }
|
||||
],
|
||||
"byAction": [
|
||||
{ "action": "suppress_creation", "count": 890 },
|
||||
{ "action": "suppress_notifications", "count": 244 },
|
||||
{ "action": "both", "count": 100 }
|
||||
],
|
||||
"byDay": [
|
||||
{ "date": "2026-01-13", "count": 156 },
|
||||
{ "date": "2026-01-14", "count": 178 },
|
||||
{ "date": "2026-01-15", "count": 145 }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Permissions
|
||||
|
||||
| Endpoint | Required Permission |
|
||||
|----------|---------------------|
|
||||
| GET suppression rules | `ProjectMember` |
|
||||
| Create/Update/Delete rules | `ProjectAdmin` |
|
||||
| Enable/Disable rules | `ProjectAdmin` |
|
||||
| GET suppressed logs | `ProjectMember` |
|
||||
| GET statistics | `ProjectMember` |
|
||||
|
||||
---
|
||||
|
||||
## Error Responses
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "INVALID_RECURRENCE_RULE",
|
||||
"message": "Invalid RRULE format: FREQ=INVALID"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Error Codes:**
|
||||
|
||||
| Code | Description |
|
||||
|------|-------------|
|
||||
| `INVALID_RECURRENCE_RULE` | Invalid RRULE format |
|
||||
| `INVALID_TIME_WINDOW` | End time before start time |
|
||||
| `RULE_NOT_FOUND` | Suppression rule doesn't exist |
|
||||
| `CANNOT_DELETE_ACTIVE_WINDOW` | Cannot delete currently active maintenance window |
|
||||
| `OVERLAPPING_WINDOWS` | Maintenance windows overlap (warning only) |
|
||||
|
||||
---
|
||||
|
||||
## Webhooks
|
||||
|
||||
### Suppression Events
|
||||
|
||||
Configure webhooks to receive suppression events:
|
||||
|
||||
```json
|
||||
{
|
||||
"event": "alert.suppressed",
|
||||
"timestamp": "2026-01-20T02:15:00Z",
|
||||
"data": {
|
||||
"projectId": "project-id",
|
||||
"suppressionRuleId": "rule-id",
|
||||
"suppressionRuleName": "Nightly Maintenance",
|
||||
"alertTitle": "MySQL connection timeout",
|
||||
"action": "both",
|
||||
"reason": "Maintenance window active"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Suppression Rule API
|
||||
- [ ] GET /alert-suppression-rule (list)
|
||||
- [ ] GET /alert-suppression-rule/:id (details)
|
||||
- [ ] POST /alert-suppression-rule (create)
|
||||
- [ ] PUT /alert-suppression-rule/:id (update)
|
||||
- [ ] DELETE /alert-suppression-rule/:id (delete)
|
||||
- [ ] POST /alert-suppression-rule/:id/enable
|
||||
- [ ] POST /alert-suppression-rule/:id/disable
|
||||
- [ ] POST /alert-suppression-rule/:id/test
|
||||
|
||||
### Maintenance Window API
|
||||
- [ ] GET /maintenance-window/active
|
||||
- [ ] GET /maintenance-window/upcoming
|
||||
- [ ] POST /maintenance-window/quick
|
||||
|
||||
### Suppression Group API
|
||||
- [ ] GET /alert-suppression-group (list)
|
||||
- [ ] POST /alert-suppression-group (create)
|
||||
- [ ] GET /alert-suppression-group/:id/rules
|
||||
|
||||
### Suppressed Log API
|
||||
- [ ] GET /suppressed-alert-log (list)
|
||||
- [ ] GET /suppressed-alert-log/statistics
|
||||
464
Docs/Plan/AlertSuppression/4-UI.md
Normal file
464
Docs/Plan/AlertSuppression/4-UI.md
Normal file
@@ -0,0 +1,464 @@
|
||||
# UI Implementation for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document details the frontend components and pages required for Alert Suppression functionality.
|
||||
|
||||
## Navigation Structure
|
||||
|
||||
```
|
||||
Dashboard
|
||||
├── Alerts
|
||||
│ ├── All Alerts (existing)
|
||||
│ ├── Episodes
|
||||
│ └── Suppressed Alerts (NEW)
|
||||
└── Settings
|
||||
├── Alerts
|
||||
│ ├── Alert States (existing)
|
||||
│ ├── Alert Severities (existing)
|
||||
│ ├── Grouping Rules
|
||||
│ └── Suppression Rules (NEW)
|
||||
└── Maintenance
|
||||
└── Maintenance Windows (NEW)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages to Create
|
||||
|
||||
### 1. Suppression Rules List Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertSuppressionRules.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-suppression-rules`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Suppression Rules [+ Create Rule] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Suppression rules prevent alert creation or notifications based on conditions. │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [All] [Maintenance Windows] [Rate Limits] [Condition-Based] │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ✅ Nightly Maintenance Window Priority: 1 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 🕐 Type: Maintenance Window │ │
|
||||
│ │ 📅 Schedule: Daily 2:00 AM - 4:00 AM PST │ │
|
||||
│ │ 🎯 Affects: All monitors │ │
|
||||
│ │ 🚫 Action: Suppress creation and notifications │ │
|
||||
│ │ 📊 Suppressed: 523 alerts │ │
|
||||
│ │ [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ✅ Rate Limit: 10 alerts/hour per monitor Priority: 2 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 📈 Type: Rate Limit │ │
|
||||
│ │ ⚡ Limit: 10 alerts per 60 minutes │ │
|
||||
│ │ 📦 Group by: Monitor │ │
|
||||
│ │ 🚫 Action: Suppress creation after threshold │ │
|
||||
│ │ 📊 Suppressed: 1,247 alerts │ │
|
||||
│ │ [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ❌ Staging Environment (Disabled) Priority: 3 │ │
|
||||
│ │ ────────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 🔧 Type: Condition-Based │ │
|
||||
│ │ 🎯 Matches: Labels contain "staging" │ │
|
||||
│ │ 🚫 Action: Suppress notifications only │ │
|
||||
│ │ [Enable] [Edit] [Delete]│ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Create/Edit Suppression Rule Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/AlertSuppressionRuleView/Index.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/alert-suppression-rules/create`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Create Suppression Rule │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ BASIC INFORMATION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Rule Name * │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Nightly Maintenance Window │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Description │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Suppress all alerts during nightly deployment window │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Rule Type * │
|
||||
│ ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ │
|
||||
│ │ ● Maintenance Window │ │ ○ Condition-Based │ │ ○ Rate Limit │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ Time-based │ │ Attribute-based │ │ Threshold-based │ │
|
||||
│ │ suppression │ │ suppression │ │ suppression │ │
|
||||
│ └──────────────────────┘ └──────────────────────┘ └──────────────────────┘ │
|
||||
│ │
|
||||
│ MATCHING CRITERIA │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ Which alerts should this rule apply to? │
|
||||
│ │
|
||||
│ ○ All alerts │
|
||||
│ ● Alerts matching specific criteria │
|
||||
│ │
|
||||
│ Severities (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [Critical ×] [High ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Monitors (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Select monitors... [Browse] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Labels (optional) │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [production ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ MAINTENANCE WINDOW │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Start Date & Time * End Date & Time * │
|
||||
│ ┌────────────────────────────────┐ ┌────────────────────────────────┐ │
|
||||
│ │ 2026-01-20 02:00 AM │ │ 2026-01-20 04:00 AM │ │
|
||||
│ └────────────────────────────────┘ └────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Timezone * │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ America/Los_Angeles (PST) [▼] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ☑ Recurring window │
|
||||
│ │
|
||||
│ Repeat * │
|
||||
│ ┌───────────────┐ │
|
||||
│ │ Daily [▼] │ │
|
||||
│ └───────────────┘ │
|
||||
│ │
|
||||
│ ○ Never ends │
|
||||
│ ● Ends on: [2026-12-31] │
|
||||
│ │
|
||||
│ SUPPRESSION ACTION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ What should happen when this rule matches? * │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ● Suppress both alert creation and notifications (Recommended) │ │
|
||||
│ │ ○ Suppress alert creation only │ │
|
||||
│ │ ○ Suppress notifications only (alert is still created) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Priority (lower = evaluated first) │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 1 │ │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ [Cancel] [Test Rule] [Save] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Rate Limit Rule Form (Conditional Section)
|
||||
|
||||
When "Rate Limit" is selected as rule type:
|
||||
|
||||
```
|
||||
│ RATE LIMIT CONFIGURATION │
|
||||
│ ───────────────────────────────────────────────────────────────────────────────── │
|
||||
│ │
|
||||
│ Maximum alerts allowed * │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 10 │ alerts │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Time window * │
|
||||
│ ┌──────────┐ │
|
||||
│ │ 60 │ minutes │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Group rate limit by: │
|
||||
│ ☑ Monitor (separate limit per monitor) │
|
||||
│ ☐ Severity (separate limit per severity) │
|
||||
│ ☐ None (global limit) │
|
||||
│ │
|
||||
│ Example: With these settings, each monitor can generate up to 10 alerts per hour. │
|
||||
│ Additional alerts from the same monitor will be suppressed. │
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Suppressed Alerts Log Page
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Alerts/SuppressedAlerts.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/alerts/suppressed`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alerts > Suppressed Alerts │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 📊 Last 7 Days: 1,234 alerts suppressed (saves ~40% notification volume) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Filters: │
|
||||
│ [Rule: All ▼] [Monitor: All ▼] [Action: All ▼] [Date: Last 7 days ▼] │
|
||||
│ │
|
||||
│ ┌───────┬──────────────────────────────────┬───────────────┬──────────┬───────────┐│
|
||||
│ │ Time │ Alert Title │ Rule │ Action │ Monitor ││
|
||||
│ ├───────┼──────────────────────────────────┼───────────────┼──────────┼───────────┤│
|
||||
│ │ 2:15 │ MySQL connection timeout │ Nightly Maint │ Both │ mysql-01 ││
|
||||
│ │ 2:14 │ MySQL connection timeout │ Nightly Maint │ Both │ mysql-01 ││
|
||||
│ │ 2:12 │ API response time > 5s │ Rate Limit │ Creation │ api-gw ││
|
||||
│ │ 2:10 │ Disk space warning │ Rate Limit │ Creation │ web-03 ││
|
||||
│ │ 2:08 │ Memory usage high │ Nightly Maint │ Both │ app-01 ││
|
||||
│ └───────┴──────────────────────────────────┴───────────────┴──────────┴───────────┘│
|
||||
│ │
|
||||
│ [1] [2] [3] ... [Next →] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. Maintenance Windows Calendar View
|
||||
|
||||
**File Location:** `/Dashboard/src/Pages/Settings/MaintenanceWindows.tsx`
|
||||
|
||||
**Route:** `/dashboard/:projectId/settings/maintenance-windows`
|
||||
|
||||
**Wireframe:**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Settings > Maintenance Windows [+ Schedule Maintenance] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ [Calendar View] [List View] │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ January 2026 │ │
|
||||
│ │ ◀ ▶ │ │
|
||||
│ │ ───────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ Sun Mon Tue Wed Thu Fri Sat │ │
|
||||
│ │ ───────────────────────────────────────────────────────────────────────────── │ │
|
||||
│ │ 1 2 3 4 │ │
|
||||
│ │ ┌─────┐ │ │
|
||||
│ │ │2-4AM│ │ │
|
||||
│ │ └─────┘ │ │
|
||||
│ │ 5 6 7 8 9 10 11 │ │
|
||||
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
|
||||
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │ │
|
||||
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
|
||||
│ │ 12 13 14 15 16 17 18 │ │
|
||||
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
|
||||
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │ │
|
||||
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
|
||||
│ │ 19 20 21 22 23 24 25 │ │
|
||||
│ │┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────────┐ │ │
|
||||
│ ││2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ │2-4AM│ ││Weekend │ │ │
|
||||
│ │└─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ ││00:00- │ │ │
|
||||
│ │ ││06:00 │ │ │
|
||||
│ │ │└─────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Legend: │
|
||||
│ ┌─────┐ Nightly Maintenance (Daily 2-4 AM) │
|
||||
│ └─────┘ │
|
||||
│ ┌─────────┐ Weekend Deployment (Sat 00:00-06:00) │
|
||||
│ └─────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 6. Active Maintenance Banner
|
||||
|
||||
Show a banner on the Alerts page when a maintenance window is active.
|
||||
|
||||
**Component:** `/Dashboard/src/Components/Alert/MaintenanceBanner.tsx`
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 🔧 MAINTENANCE ACTIVE: "Nightly Maintenance Window" - Ends in 1h 45m │
|
||||
│ Alerts matching this window will be suppressed. [View Details]│
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 7. Quick Maintenance Modal
|
||||
|
||||
Triggered from the Alerts page or a Monitor detail page.
|
||||
|
||||
**Component:** `/Dashboard/src/Components/Suppression/QuickMaintenanceModal.tsx`
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Start Maintenance Window [X] │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Quick maintenance window starting now. │
|
||||
│ │
|
||||
│ Name │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Emergency maintenance │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Duration │
|
||||
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ 15 min │ │ 30 min │ │ ● 1 hour │ │ 2 hours │ │
|
||||
│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────┐ │
|
||||
│ │ Custom: [__] │ minutes │
|
||||
│ └──────────────┘ │
|
||||
│ │
|
||||
│ Apply to │
|
||||
│ ○ All monitors │
|
||||
│ ● Selected monitors: │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ [mysql-production ×] [api-gateway ×] [+ Add] │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ [Cancel] [Start Maintenance] │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Components to Create
|
||||
|
||||
### 1. SuppressionRuleCard
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/SuppressionRuleCard.tsx`
|
||||
|
||||
```typescript
|
||||
interface SuppressionRuleCardProps {
|
||||
rule: AlertSuppressionRule;
|
||||
onEdit: () => void;
|
||||
onDelete: () => void;
|
||||
onToggleEnabled: () => void;
|
||||
}
|
||||
```
|
||||
|
||||
### 2. MaintenanceWindowForm
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/MaintenanceWindowForm.tsx`
|
||||
|
||||
Handles date/time selection, timezone, and recurrence configuration.
|
||||
|
||||
### 3. RateLimitForm
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/RateLimitForm.tsx`
|
||||
|
||||
Handles max alerts, time window, and group-by field selection.
|
||||
|
||||
### 4. MatchCriteriaBuilder
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/MatchCriteriaBuilder.tsx`
|
||||
|
||||
Reusable component for building match criteria (severities, monitors, labels, patterns).
|
||||
|
||||
### 5. SuppressionActionSelector
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/SuppressionActionSelector.tsx`
|
||||
|
||||
Radio group for selecting suppression action type.
|
||||
|
||||
### 6. MaintenanceCalendar
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/MaintenanceCalendar.tsx`
|
||||
|
||||
Calendar view showing maintenance windows.
|
||||
|
||||
### 7. SuppressedAlertsBadge
|
||||
|
||||
**File:** `/Dashboard/src/Components/Suppression/SuppressedAlertsBadge.tsx`
|
||||
|
||||
Badge showing count of suppressed alerts.
|
||||
|
||||
---
|
||||
|
||||
## Routing Configuration
|
||||
|
||||
Add to route configuration:
|
||||
|
||||
```typescript
|
||||
// Suppression routes
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-suppression-rules',
|
||||
component: AlertSuppressionRulesPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-suppression-rules/create',
|
||||
component: CreateSuppressionRulePage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/alert-suppression-rules/:ruleId',
|
||||
component: SuppressionRuleDetailPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/settings/maintenance-windows',
|
||||
component: MaintenanceWindowsPage,
|
||||
},
|
||||
{
|
||||
path: '/dashboard/:projectId/alerts/suppressed',
|
||||
component: SuppressedAlertsPage,
|
||||
},
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pages
|
||||
- [ ] Suppression rules list page
|
||||
- [ ] Create/edit suppression rule page
|
||||
- [ ] Suppressed alerts log page
|
||||
- [ ] Maintenance windows calendar page
|
||||
|
||||
### Components
|
||||
- [ ] SuppressionRuleCard
|
||||
- [ ] MaintenanceWindowForm
|
||||
- [ ] RateLimitForm
|
||||
- [ ] MatchCriteriaBuilder
|
||||
- [ ] SuppressionActionSelector
|
||||
- [ ] MaintenanceCalendar
|
||||
- [ ] QuickMaintenanceModal
|
||||
- [ ] MaintenanceBanner
|
||||
- [ ] SuppressedAlertsBadge
|
||||
|
||||
### Navigation Updates
|
||||
- [ ] Add sidebar menu items
|
||||
- [ ] Add route configuration
|
||||
- [ ] Add navigation helpers
|
||||
551
Docs/Plan/AlertSuppression/5-Migration.md
Normal file
551
Docs/Plan/AlertSuppression/5-Migration.md
Normal file
@@ -0,0 +1,551 @@
|
||||
# Migration & Rollout Plan for Alert Suppression
|
||||
|
||||
## Overview
|
||||
|
||||
This document outlines the database migrations and rollout strategy for Alert Suppression functionality.
|
||||
|
||||
## Database Migrations
|
||||
|
||||
### Migration 1: Create AlertSuppressionGroup Table
|
||||
|
||||
```typescript
|
||||
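// Imports shared by all migration snippets in this document
// (TableForeignKey is used by Migration 2, TableColumn by Migration 5).
import { MigrationInterface, QueryRunner, Table, TableColumn, TableForeignKey, TableIndex } from 'typeorm';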
|
||||
|
||||
export class CreateAlertSuppressionGroup implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertSuppressionGroup',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'name',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'description',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'throttleMinutes',
|
||||
type: 'integer',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'throttleUntil',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertSuppressionGroup',
|
||||
new TableIndex({
|
||||
name: 'idx_suppression_group_project',
|
||||
columnNames: ['projectId'],
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertSuppressionGroup');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 2: Create AlertSuppressionRule Table
|
||||
|
||||
```typescript
|
||||
export class CreateAlertSuppressionRule implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertSuppressionRule',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'name',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'description',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'type',
|
||||
type: 'varchar',
|
||||
length: '50',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'isEnabled',
|
||||
type: 'boolean',
|
||||
default: true,
|
||||
},
|
||||
{
|
||||
name: 'matchCriteria',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'maintenanceWindow',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'condition',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'rateLimit',
|
||||
type: 'jsonb',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'action',
|
||||
type: 'varchar',
|
||||
length: '50',
|
||||
isNullable: false,
|
||||
default: "'both'",
|
||||
},
|
||||
{
|
||||
name: 'suppressionGroupId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'priority',
|
||||
type: 'integer',
|
||||
default: 100,
|
||||
},
|
||||
{
|
||||
name: 'suppressedCount',
|
||||
type: 'integer',
|
||||
default: 0,
|
||||
},
|
||||
{
|
||||
name: 'lastTriggeredAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdByUserId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'AlertSuppressionRule',
|
||||
new TableIndex({
|
||||
name: 'idx_suppression_rule_project_enabled',
|
||||
columnNames: ['projectId', 'isEnabled', 'priority'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertSuppressionRule',
|
||||
new TableIndex({
|
||||
name: 'idx_suppression_rule_type',
|
||||
columnNames: ['projectId', 'type', 'isEnabled'],
|
||||
})
|
||||
);
|
||||
|
||||
// Foreign keys
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertSuppressionRule',
|
||||
new TableForeignKey({
|
||||
columnNames: ['projectId'],
|
||||
referencedTableName: 'Project',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'CASCADE',
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createForeignKey(
|
||||
'AlertSuppressionRule',
|
||||
new TableForeignKey({
|
||||
columnNames: ['suppressionGroupId'],
|
||||
referencedTableName: 'AlertSuppressionGroup',
|
||||
referencedColumnNames: ['_id'],
|
||||
onDelete: 'SET NULL',
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertSuppressionRule');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 3: Create SuppressedAlertLog Table
|
||||
|
||||
```typescript
|
||||
export class CreateSuppressedAlertLog implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'SuppressedAlertLog',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'suppressionRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'alertData',
|
||||
type: 'jsonb',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'alertTitle',
|
||||
type: 'text',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'suppressionReason',
|
||||
type: 'text',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'action',
|
||||
type: 'varchar',
|
||||
length: '50',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'suppressedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'monitorId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'deletedAt',
|
||||
type: 'timestamp',
|
||||
isNullable: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'SuppressedAlertLog',
|
||||
new TableIndex({
|
||||
name: 'idx_suppressed_log_project_date',
|
||||
columnNames: ['projectId', 'suppressedAt'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'SuppressedAlertLog',
|
||||
new TableIndex({
|
||||
name: 'idx_suppressed_log_rule',
|
||||
columnNames: ['suppressionRuleId', 'suppressedAt'],
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('SuppressedAlertLog');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 4: Create AlertThrottleState Table
|
||||
|
||||
```typescript
|
||||
export class CreateAlertThrottleState implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.createTable(
|
||||
new Table({
|
||||
name: 'AlertThrottleState',
|
||||
columns: [
|
||||
{
|
||||
name: '_id',
|
||||
type: 'uuid',
|
||||
isPrimary: true,
|
||||
default: 'uuid_generate_v4()',
|
||||
},
|
||||
{
|
||||
name: 'projectId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'throttleKey',
|
||||
type: 'varchar',
|
||||
length: '500',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'suppressionRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'alertCount',
|
||||
type: 'integer',
|
||||
default: 0,
|
||||
},
|
||||
{
|
||||
name: 'firstAlertAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'lastAlertAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'windowExpiresAt',
|
||||
type: 'timestamp',
|
||||
isNullable: false,
|
||||
},
|
||||
{
|
||||
name: 'isThrottling',
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
},
|
||||
{
|
||||
name: 'createdAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
{
|
||||
name: 'updatedAt',
|
||||
type: 'timestamp',
|
||||
default: 'CURRENT_TIMESTAMP',
|
||||
},
|
||||
],
|
||||
}),
|
||||
true
|
||||
);
|
||||
|
||||
// Indexes
|
||||
await queryRunner.createIndex(
|
||||
'AlertThrottleState',
|
||||
new TableIndex({
|
||||
name: 'idx_throttle_state_key',
|
||||
columnNames: ['throttleKey', 'windowExpiresAt'],
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.createIndex(
|
||||
'AlertThrottleState',
|
||||
new TableIndex({
|
||||
name: 'idx_throttle_state_unique',
|
||||
columnNames: ['throttleKey', 'suppressionRuleId'],
|
||||
isUnique: true,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropTable('AlertThrottleState');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Migration 5: Add Suppression Fields to Alert Table
|
||||
|
||||
```typescript
|
||||
export class AddSuppressionFieldsToAlert implements MigrationInterface {
|
||||
public async up(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'notificationsSuppressed',
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
})
|
||||
);
|
||||
|
||||
await queryRunner.addColumn(
|
||||
'Alert',
|
||||
new TableColumn({
|
||||
name: 'suppressedByRuleId',
|
||||
type: 'uuid',
|
||||
isNullable: true,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
public async down(queryRunner: QueryRunner): Promise<void> {
|
||||
await queryRunner.dropColumn('Alert', 'suppressedByRuleId');
|
||||
await queryRunner.dropColumn('Alert', 'notificationsSuppressed');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollout Strategy
|
||||
|
||||
### Phase 1: Internal Testing
|
||||
|
||||
**Duration:** 1 week
|
||||
|
||||
- Deploy to staging environment
|
||||
- Create test suppression rules
|
||||
- Verify suppression logic works correctly
|
||||
- Test all three rule types
|
||||
|
||||
### Phase 2: Beta (Opt-in)
|
||||
|
||||
**Duration:** 2 weeks
|
||||
|
||||
- Enable feature flag for early adopters
|
||||
- Collect feedback on UI/UX
|
||||
- Monitor for performance issues
|
||||
- Document common use cases
|
||||
|
||||
### Phase 3: General Availability
|
||||
|
||||
**Duration:** Ongoing
|
||||
|
||||
- Enable for all projects
|
||||
- Default rules disabled
|
||||
- Users opt-in by creating rules
|
||||
|
||||
---
|
||||
|
||||
## Data Retention
|
||||
|
||||
### SuppressedAlertLog Retention
|
||||
|
||||
Suppressed alert logs should be retained for compliance, then deleted once the retention period (90 days in the example below) has elapsed:
|
||||
|
||||
```typescript
|
||||
// Worker job to clean up old logs
|
||||
RunCron(
|
||||
'SuppressedAlertLog:Cleanup',
|
||||
{ schedule: EVERY_DAY, runOnStartup: false },
|
||||
async () => {
|
||||
const retentionDays = 90; // Default retention period; TODO: make this configurable per project
|
||||
const cutoffDate = OneUptimeDate.addRemoveDays(
|
||||
OneUptimeDate.getCurrentDate(),
|
||||
-retentionDays
|
||||
);
|
||||
|
||||
await SuppressedAlertLogService.deleteBy({
|
||||
query: {
|
||||
suppressedAt: QueryHelper.lessThan(cutoffDate),
|
||||
},
|
||||
props: { isRoot: true },
|
||||
});
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
### Pre-Migration
|
||||
- [ ] Review migration scripts
|
||||
- [ ] Test on staging
|
||||
- [ ] Backup production database
|
||||
|
||||
### Migration
|
||||
- [ ] Run migrations in order
|
||||
- [ ] Verify table creation
|
||||
- [ ] Verify indexes
|
||||
|
||||
### Post-Migration
|
||||
- [ ] Deploy API changes
|
||||
- [ ] Deploy Dashboard changes
|
||||
- [ ] Deploy Worker jobs
|
||||
- [ ] Enable feature flags
|
||||
|
||||
### Monitoring
|
||||
- [ ] Set up suppression metrics
|
||||
- [ ] Alert on engine errors
|
||||
- [ ] Monitor performance
|
||||
165
Docs/Plan/AlertSuppression/README.md
Normal file
165
Docs/Plan/AlertSuppression/README.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# Alert Suppression Implementation Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This sub-plan details the implementation of Alert Suppression functionality for OneUptime. This feature allows users to suppress alert creation and/or notifications based on configurable rules including maintenance windows, conditions, and rate limits.
|
||||
|
||||
## Documents
|
||||
|
||||
| Document | Description |
|
||||
|----------|-------------|
|
||||
| [1-DataModels.md](./1-DataModels.md) | Database models and schema definitions |
|
||||
| [2-Backend.md](./2-Backend.md) | Backend services and suppression engine |
|
||||
| [3-API.md](./3-API.md) | REST API endpoints |
|
||||
| [4-UI.md](./4-UI.md) | Frontend components and pages |
|
||||
| [5-Migration.md](./5-Migration.md) | Database migrations and rollout |
|
||||
|
||||
## Feature Summary
|
||||
|
||||
### What is Alert Suppression?
|
||||
|
||||
Alert Suppression allows you to temporarily or permanently prevent alerts from being created or notifications from being sent based on configurable rules.
|
||||
|
||||
### Suppression Types
|
||||
|
||||
| Type | Description | Use Case |
|
||||
|------|-------------|----------|
|
||||
| **Maintenance Window** | Time-based suppression | Planned deployments, scheduled maintenance |
|
||||
| **Condition-Based** | Suppress based on alert attributes | Ignore staging alerts, low-priority monitors |
|
||||
| **Rate Limit** | Suppress after threshold exceeded | Prevent alert storms, noise reduction |
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
1. **Maintenance Windows** - Schedule suppression periods (one-time or recurring)
|
||||
2. **Condition Matching** - Suppress alerts matching specific criteria
|
||||
3. **Rate Limiting** - Limit alerts per time window per dimension
|
||||
4. **Suppression Actions** - Choose to suppress creation, notifications, or both
|
||||
5. **Audit Trail** - Track all suppressed alerts for compliance
|
||||
6. **Suppression Groups** - Group related rules for coordinated suppression
|
||||
|
||||
### User Stories
|
||||
|
||||
```
|
||||
As an operator, I want to create a maintenance window
|
||||
so that I don't get alerted during planned deployments.
|
||||
|
||||
As a team lead, I want to suppress notifications for staging alerts
|
||||
so that my team only gets paged for production issues.
|
||||
|
||||
As an SRE, I want to rate-limit alerts per monitor
|
||||
so that a single flapping service doesn't flood my inbox.
|
||||
|
||||
As a compliance officer, I want to see which alerts were suppressed
|
||||
so that I can audit our alert handling procedures.
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Data Models & Core Engine (Week 1-2)
|
||||
|
||||
- [ ] Create AlertSuppressionRule model
|
||||
- [ ] Create AlertSuppressionGroup model
|
||||
- [ ] Create SuppressedAlertLog model
|
||||
- [ ] Implement SuppressionEngine
|
||||
- [ ] Integrate with AlertService
|
||||
|
||||
### Phase 2: Maintenance Windows (Week 3)
|
||||
|
||||
- [ ] Time-based suppression logic
|
||||
- [ ] Recurring schedule support (RRULE)
|
||||
- [ ] Timezone handling
|
||||
- [ ] Calendar UI component
|
||||
|
||||
### Phase 3: Condition & Rate Limiting (Week 4)
|
||||
|
||||
- [ ] Condition-based matching
|
||||
- [ ] Rate limit state tracking
|
||||
- [ ] AlertThrottleState model
|
||||
- [ ] Per-field rate limiting
|
||||
|
||||
### Phase 4: UI Implementation (Week 5-6)
|
||||
|
||||
- [ ] Suppression rules list page
|
||||
- [ ] Create/edit rule forms
|
||||
- [ ] Maintenance window calendar
|
||||
- [ ] Suppressed alerts log view
|
||||
|
||||
### Phase 5: Analytics & Reporting (Week 7)
|
||||
|
||||
- [ ] Suppression metrics dashboard
|
||||
- [ ] Noise reduction statistics
|
||||
- [ ] Audit log export
|
||||
|
||||
## Dependencies
|
||||
|
||||
### Existing Components Used
|
||||
|
||||
- `Alert` model and `AlertService`
|
||||
- `AlertSeverity` and `AlertState` models
|
||||
- `Monitor` and `Label` models
|
||||
- Dashboard ModelTable and ModelForm components
|
||||
- Notification system
|
||||
|
||||
### New Components Created
|
||||
|
||||
- `AlertSuppressionRule` model
|
||||
- `AlertSuppressionGroup` model
|
||||
- `SuppressedAlertLog` model
|
||||
- `AlertThrottleState` model
|
||||
- `SuppressionEngine` service
|
||||
- Suppression UI pages
|
||||
|
||||
## Success Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Time to create a suppression rule | < 5 minutes |
|
||||
| Rule evaluation latency | < 10ms |
|
||||
| Maintenance window accuracy | 100% (no alerts during window) |
|
||||
| User adoption | 60% of projects have at least one suppression rule |
|
||||
|
||||
## Architecture Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Alert Creation Flow │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌──────────────────────┐
|
||||
│ Alert Trigger │
|
||||
│ (Monitor/Manual) │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ SuppressionEngine │
|
||||
│ .evaluate() │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
┌────────────────┼────────────────┐
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌─────────────────┐ ┌─────────────┐ ┌─────────────────┐
|
||||
│ Maintenance │ │ Condition │ │ Rate Limit │
|
||||
│ Window Check │ │ Check │ │ Check │
|
||||
└────────┬────────┘ └──────┬──────┘ └────────┬────────┘
|
||||
│ │ │
|
||||
└────────────────┼────────────────┘
|
||||
│
|
||||
┌─────────┴─────────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐
|
||||
│ SUPPRESS │ │ ALLOW │
|
||||
│ - Log to audit │ │ - Create alert │
|
||||
│ - Skip creation │ │ - Send notifs │
|
||||
│ or notifs │ │ │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
- [Parent Plan: AlertEngine.md](../AlertEngine.md)
|
||||
- [Alert Grouping Plan](../AlertGrouping/README.md)
|
||||
- [PagerDuty Maintenance Windows](https://support.pagerduty.com/docs/maintenance-windows)
|
||||
- [Splunk Alert Suppression](https://docs.splunk.com/Documentation/ITSI)
|
||||
Reference in New Issue
Block a user