From a02018aeb20e264f57e2fea824ac4bbf6cf5061e Mon Sep 17 00:00:00 2001 From: Nawaz Dhandala Date: Mon, 16 Mar 2026 21:23:59 +0000 Subject: [PATCH] refactor: adjust formatting in SelectFieldGenerator test and update roadmap documents for clarity and new features --- CLI/Tests/SelectFieldGenerator.test.ts | 2 +- Internal/Roadmap/Logs.md | 50 ++------------ Internal/Roadmap/Metrics.md | 96 ++++++-------------------- Internal/Roadmap/Traces.md | 45 ++---------- 4 files changed, 33 insertions(+), 160 deletions(-) diff --git a/CLI/Tests/SelectFieldGenerator.test.ts b/CLI/Tests/SelectFieldGenerator.test.ts index 52e471df41..740f2ca4cd 100644 --- a/CLI/Tests/SelectFieldGenerator.test.ts +++ b/CLI/Tests/SelectFieldGenerator.test.ts @@ -6,7 +6,7 @@ import { JSONObject } from "Common/Types/JSON"; * fail when transitive dependencies are not resolvable by ts-jest). */ const AnalyticsTableName: { - readonly Log: "LogItemV2"; + readonly Log: "LogItemV2"; } = { Log: "LogItemV2", } as const; diff --git a/Internal/Roadmap/Logs.md b/Internal/Roadmap/Logs.md index b97fd71dff..678f394f65 100644 --- a/Internal/Roadmap/Logs.md +++ b/Internal/Roadmap/Logs.md @@ -27,6 +27,7 @@ The following features have been implemented and removed from this plan: - **Phase 4.2** - Keyboard Shortcuts (j/k navigation, Enter expand/collapse, Esc close, / focus search, Ctrl+Enter apply filters, ? 
help) - **Phase 4.3** - Sensitive Data Scrubbing (LogScrubRule model with PII patterns: Email, CreditCard, SSN, PhoneNumber, IPAddress, custom regex) - **Phase 5.3** - DateTime64 time column upgrade (DateTime64(9) nanosecond precision, toClickhouseDateTime64 utility, data migration, all ingestion services updated) +- **Phase 5.7** - Histogram Projections (`proj_severity_histogram` projection defined in Log model, aggregating by projectId, severityText, and 1-minute intervals; Projection type extended; StatementGenerator emits PROJECTION clause) ## Gap Analysis Summary @@ -57,54 +58,11 @@ The following features have been implemented and removed from this plan: **Current**: `retainTelemetryDataForDays` exists on the service model and is displayed in usage history, but there is no dedicated UI to configure retention settings. **Target**: Settings page for configuring per-service log data retention. -## Phase 5: ClickHouse Storage & Query Optimizations (P0) — Performance Foundation - -These optimizations address fundamental storage and indexing gaps in the telemetry tables that directly impact search speed, data correctness, and operational cost. - -### 5.7 Add Projections for Histogram Queries (Medium) - -**Current**: `projections: []` is empty. Every histogram query (group by time bucket + severity) and facet query scans raw data and performs the aggregation from scratch. - -**Target**: ClickHouse projections that pre-aggregate data for the most common query patterns. 
- -**Implementation**: - -- Add a projection for histogram/severity aggregation: - ```sql - PROJECTION proj_severity_histogram ( - SELECT - severityText, - toStartOfInterval(time, INTERVAL 1 MINUTE) AS minute, - count() AS cnt - ORDER BY (projectId, minute, severityText) - ) - ``` -- Extend the existing `Projection` type at `Common/Types/AnalyticsDatabase/Projection.ts` to support full projection definitions -- Wire projection creation into `StatementGenerator.toTableCreateStatement()` -- Migration to materialize the projection on existing data: `ALTER TABLE LogItem MATERIALIZE PROJECTION proj_severity_histogram` - -**Expected improvement**: 5-10x faster histogram queries since ClickHouse reads the pre-aggregated projection instead of scanning raw log rows. - -**Files to modify**: -- `Common/Models/AnalyticsModels/Log.ts` (define projections) -- `Common/Types/AnalyticsDatabase/Projection.ts` (extend type) -- `Common/Server/Utils/AnalyticsDatabase/StatementGenerator.ts` (emit PROJECTION clause) -- `Worker/DataMigrations/` (new migration to materialize) - -### 5.x Remaining Performance Impact Summary - -| Optimization | Query Pattern Improved | Expected Speedup | Effort | -|-------------|----------------------|-------------------|--------| -| 5.7 Histogram projections | Histogram and severity aggregation | 5-10x | Medium | - ---- - ## Recommended Remaining Implementation Order -1. **5.7** — Projections (performance polish) -2. **Log-based Metrics** (platform capability) -3. **Data Retention Config UI** (operational) -4. **Log Patterns / ML Clustering** (advanced, larger effort) +1. **Log-based Metrics** (platform capability) +2. **Data Retention Config UI** (operational) +3. 
**Log Patterns / ML Clustering** (advanced, larger effort) --- diff --git a/Internal/Roadmap/Metrics.md b/Internal/Roadmap/Metrics.md index 9d653929b3..15ec2caf38 100644 --- a/Internal/Roadmap/Metrics.md +++ b/Internal/Roadmap/Metrics.md @@ -20,6 +20,10 @@ The following features have been implemented: - **MetricType Auto-Discovery** - Name, description, unit captured on first ingest - **Attribute Storage** - Full JSON with extracted `attributeKeys` array for fast enumeration - **BloomFilter index** on `name`, Set index on `serviceType` +- **Phase 2.3** - Render Metric Units on Charts (MetricType.unit passed through to chart Y-axis and tooltip formatting) +- **S.1** - Fix Sort Key Order (sort key changed to `projectId, name, serviceId, time` for optimal metric name filtering) +- **S.2** - Upgrade time to DateTime64 (time column uses `TableColumnType.DateTime64` for sub-second precision) +- **S.3** - Add Skip Index on metricPointType (Set skip index `idx_metric_point_type` added to Metric model) ## Gap Analysis Summary @@ -41,7 +45,6 @@ The following features have been implemented: | Cardinality management | None | Metrics Without Limits + Explorer | Budget system + pruning rules | **P2** | | More chart types | Line and bar only | 12+ types | 10+ types with conditional coloring | **P2** | | Dashboard templates | None | Pre-built integration dashboards | Pre-built entity dashboards | **P2** | -| Units on charts | Stored but not rendered | Auto-formatted by unit type | Y-axis unit customization | **P2** | | Natural language querying | None | NLQ translates English to queries | None | **P3** | | Metric cost/volume management | None | Cost attribution dashboards | Volume dashboards | **P3** | @@ -174,25 +177,6 @@ These are table-stakes features without which the metrics product is fundamental - `Common/Types/Metrics/MetricsQuery.ts` (add compareWith field) - `App/FeatureSet/Dashboard/src/Components/Metrics/MetricGraph.tsx` (render comparison series) -### 2.3 Render 
Metric Units on Charts - -**Current**: Units stored in MetricType but not rendered on chart axes. -**Target**: Display units on Y-axis labels and tooltips with smart formatting. - -**Implementation**: - -- Pass `MetricType.unit` through to chart rendering -- Implement unit-aware formatting: - - Bytes: auto-convert to KB/MB/GB/TB - - Duration: auto-convert ns/us/ms/s - - Percentage: append `%` - - Rate: append `/s` -- Display formatted unit on Y-axis label and in tooltip values - -**Files to modify**: -- `App/FeatureSet/Dashboard/src/Components/Metrics/MetricGraph.tsx` (Y-axis unit formatting) -- `Common/Utils/Metrics/UnitFormatter.ts` (new - unit formatting logic) - ### 2.4 Dashboard Templates **Current**: No templates. @@ -380,40 +364,6 @@ These are table-stakes features without which the metrics product is fundamental ## ClickHouse Storage Improvements -### S.1 Fix Sort Key Order (CRITICAL) - -**Current**: Sort key is `(projectId, time, serviceId)`. -**Target**: Change to `(projectId, name, serviceId, time)`. - -**Impact**: ~100x improvement for name-filtered queries. Virtually every metric query filters by `name`, but currently ClickHouse must scan all metric names within the time range. - -**Migration**: Requires creating `MetricItem_v2` with new sort key and migrating data (ClickHouse doesn't support `ALTER TABLE MODIFY ORDER BY`). - -**Files to modify**: -- `Common/Models/AnalyticsModels/Metric.ts` (change sort key) -- `Worker/DataMigrations/` (new migration - create v2 table, backfill, swap) - -### S.2 Upgrade time to DateTime64 (HIGH) - -**Current**: `DateTime` with second precision. -**Target**: `DateTime64(3)` or `DateTime64(6)` for sub-second precision. - -**Impact**: Correct sub-second metric ordering. Removes need for separate `timeUnixNano`/`startTimeUnixNano` columns. 
- -**Files to modify**: -- `Common/Models/AnalyticsModels/Metric.ts` (change column type) -- `Common/Types/AnalyticsDatabase/TableColumnType.ts` (add DateTime64 type if not present) -- `Common/Server/Utils/AnalyticsDatabase/StatementGenerator.ts` (handle DateTime64) -- `Worker/DataMigrations/` (migration) - -### S.3 Add Skip Index on metricPointType (MEDIUM) - -**Current**: No index support for metric type filtering. -**Target**: Set skip index on `metricPointType`. - -**Files to modify**: -- `Common/Models/AnalyticsModels/Metric.ts` (add skip index) - ### S.4 Evaluate Map Type for Attributes (MEDIUM) **Current**: Attributes stored as JSON. @@ -428,34 +378,30 @@ These are table-stakes features without which the metrics product is fundamental ## Quick Wins (Can Ship This Week) -1. **Display units on chart Y-axes** - Data exists in MetricType, just needs wiring to chart rendering -2. **Add p50/p95/p99 to aggregation dropdown** - ClickHouse `quantile()` is straightforward to add -3. **Extend default retention** - 15 days is too short; increase default to 30 days -4. **Multi-attribute GROUP BY** - Change `groupByAttribute: string` to `groupByAttribute: string[]` -5. **Add stacked area chart type** - Simple extension of existing line chart -6. **Add skip index on metricPointType** - Low effort, faster type-filtered queries +1. **Add p50/p95/p99 to aggregation dropdown** - ClickHouse `quantile()` is straightforward to add +2. **Extend default retention** - 15 days is too short; increase default to 30 days +3. **Multi-attribute GROUP BY** - Change `groupByAttribute: string` to `groupByAttribute: string[]` +4. **Add stacked area chart type** - Simple extension of existing line chart --- ## Recommended Implementation Order -1. **Quick Wins** - Ship units on charts, p50/p95/p99, multi-attribute GROUP BY, stacked area +1. **Quick Wins** - Ship p50/p95/p99, multi-attribute GROUP BY, stacked area 2. 
**Phase 1.1** - Percentile aggregations (full implementation beyond quick win) 3. **Phase 1.2** - Rate/derivative calculations -4. **S.1** - Fix sort key order (critical performance improvement) -5. **Phase 1.4** - Rollups/downsampling (enables long-range queries) -6. **Phase 2.1** - More chart types (heatmap, pie, gauge, billboard) -7. **Phase 2.2** - Time-over-time comparison -8. **Phase 1.3** - Multi-attribute GROUP BY (full implementation) -9. **S.2** - Upgrade time to DateTime64 -10. **Phase 3.1** - Anomaly detection -11. **Phase 3.2** - SLO/SLI tracking -12. **Phase 2.4** - Dashboard templates -13. **Phase 4.1** - Cardinality management -14. **Phase 4.2** - Query language -15. **Phase 4.3** - Golden Signals dashboards -16. **Phase 4.4** - Predictive alerting -17. **Phase 3.3** - Metric correlations +4. **Phase 1.4** - Rollups/downsampling (enables long-range queries) +5. **Phase 2.1** - More chart types (heatmap, pie, gauge, billboard) +6. **Phase 2.2** - Time-over-time comparison +7. **Phase 1.3** - Multi-attribute GROUP BY (full implementation) +8. **Phase 3.1** - Anomaly detection +9. **Phase 3.2** - SLO/SLI tracking +10. **Phase 2.4** - Dashboard templates +11. **Phase 4.1** - Cardinality management +12. **Phase 4.2** - Query language +13. **Phase 4.3** - Golden Signals dashboards +14. **Phase 4.4** - Predictive alerting +15. 
**Phase 3.3** - Metric correlations ## Verification diff --git a/Internal/Roadmap/Traces.md b/Internal/Roadmap/Traces.md index e62ad562a5..9f6420ea8c 100644 --- a/Internal/Roadmap/Traces.md +++ b/Internal/Roadmap/Traces.md @@ -22,6 +22,8 @@ The following features have been implemented: - **hasException boolean column** for fast error span filtering - **links default value** corrected to `[]` - **Basic Trace-Based Alerting** - MonitorType.Traces with span count threshold alerting, span name/status/service/attribute filtering, time window (5s-24h), worker job running every minute, frontend form with preview +- **S.1** - Migrate `attributes` to Map(String, String) (TableColumnType.MapStringString in Span model with `attributeKeys` array for fast enumeration) +- **S.2** - Aggregation Projections (`proj_agg_by_service` for service-level COUNT/AVG/P99 aggregation, `proj_trace_by_id` for trace-by-ID queries) ## Gap Analysis Summary @@ -366,41 +368,9 @@ Without these, users cannot answer basic questions like "is my service healthy?" ## ClickHouse Storage Improvements -### S.1 Migrate `attributes` to Map(String, String) (HIGH) - -**Current**: `attributes` is stored as opaque `String` (JSON). Querying by attribute value requires `LIKE` or `JSONExtract()` scans. -**Target**: `Map(String, String)` type enabling `attributes['http.method'] = 'GET'` without JSON parsing. - -**Impact**: Significant query speedup for attribute-based span filtering -- the most common query pattern after time-range filtering. - -**Files to modify**: -- `Common/Models/AnalyticsModels/Span.ts` (change column type) -- `Common/Server/Utils/AnalyticsDatabase/StatementGenerator.ts` (handle Map type) -- `Telemetry/Services/OtelTracesIngestService.ts` (write Map format) -- `Worker/DataMigrations/` (new migration) - -### S.2 Add Aggregation Projection (MEDIUM) - -**Current**: `projections: []` is empty. -**Target**: Pre-aggregation projection for common dashboard queries. 
- -```sql -PROJECTION agg_by_service ( - SELECT - serviceId, - toStartOfMinute(startTime) AS minute, - count(), - avg(durationUnixNano), - quantile(0.99)(durationUnixNano) - GROUP BY serviceId, minute -) -``` - -**Impact**: 5-10x faster aggregation queries for service overview dashboards. - ### S.3 Add Trace-by-ID Projection (LOW) -**Current**: Trace detail view relies on BloomFilter skip index for traceId lookups. +**Current**: Trace detail view relies on BloomFilter skip index for traceId lookups. (Note: the `proj_trace_by_id` projection has already been added as part of S.2; the remaining work is to evaluate whether trace-by-ID queries need further optimization.) **Target**: Projection sorted by `(projectId, traceId, startTime)` for faster trace-by-ID queries. --- @@ -426,11 +396,10 @@ PROJECTION agg_by_service ( 5. **Phase 2.1** - Flame Graph View (industry-standard visualization) 6. **Phase 2.2** - Critical Path Analysis (key debugging capability) 7. **Phase 1.4** - Head-Based Sampling (essential for high-volume users) -8. **S.1** - Migrate attributes to Map type (storage optimization) -9. **Phase 2.3-2.5** - In-trace search, per-trace map, span links -10. **Phase 3.1** - Trace-to-Metric Exemplars -11. **Phase 3.2-3.4** - Custom metrics, structural queries, comparison -12. **Phase 4.x** - AI/ML, RUM, profiling (long-term) +8. **Phase 2.3-2.5** - In-trace search, per-trace map, span links +9. **Phase 3.1** - Trace-to-Metric Exemplars +10. **Phase 3.2-3.4** - Custom metrics, structural queries, comparison +11. **Phase 4.x** - AI/ML, RUM, profiling (long-term) ## Verification