From 150a9b15c9519266281f21d66a7c6dbb4102ec97 Mon Sep 17 00:00:00 2001 From: Wayne <5291640+ringoinca@users.noreply.github.com> Date: Mon, 13 Oct 2025 15:25:46 +0200 Subject: [PATCH] feat(attachments): De-duplicate attachment content by content hash This commit refactors attachment handling to allow multiple emails within the same ingestion source to reference attachments with identical content (same hash). Changes: - The unique index on the `attachments` table has been changed to a non-unique index to permit duplicate hash/source pairs. - The ingestion logic is updated to first check for an existing attachment with the same hash and source. If found, it reuses the existing record; otherwise, it creates a new one. This maintains storage de-duplication. - The email deletion logic is improved to be more robust. It now correctly removes the email-attachment link before checking if the attachment record and its corresponding file can be safely deleted. --- .../migrations/0023_swift_swordsman.sql | 2 + .../migrations/meta/0023_snapshot.json | 1326 +++++++++++++++++ .../database/migrations/meta/_journal.json | 7 + .../src/database/schema/attachments.ts | 4 +- .../src/services/ArchivedEmailService.ts | 42 +- .../backend/src/services/IngestionService.ts | 74 +- 6 files changed, 1408 insertions(+), 47 deletions(-) create mode 100644 packages/backend/src/database/migrations/0023_swift_swordsman.sql create mode 100644 packages/backend/src/database/migrations/meta/0023_snapshot.json diff --git a/packages/backend/src/database/migrations/0023_swift_swordsman.sql b/packages/backend/src/database/migrations/0023_swift_swordsman.sql new file mode 100644 index 0000000..cf8e632 --- /dev/null +++ b/packages/backend/src/database/migrations/0023_swift_swordsman.sql @@ -0,0 +1,2 @@ +DROP INDEX "source_hash_unique";--> statement-breakpoint +CREATE INDEX "source_hash_idx" ON "attachments" USING btree ("ingestion_source_id","content_hash_sha256"); \ No newline at end of file diff --git a/packages/backend/src/database/migrations/meta/0023_snapshot.json b/packages/backend/src/database/migrations/meta/0023_snapshot.json new file mode 100644 index 0000000..724edd9 --- /dev/null +++ b/packages/backend/src/database/migrations/meta/0023_snapshot.json @@ -0,0 +1,1326 @@ +{ + "id": "2747b009-4502-4e19-a725-1c5e9807c52b", + "prevId": "d61a882f-72c4-4fbb-8a6e-6c9dc8372679", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.archived_emails": { + "name": "archived_emails", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "thread_id": { + "name": "thread_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "ingestion_source_id": { + "name": "ingestion_source_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "user_email": { + "name": "user_email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "message_id_header": { + "name": "message_id_header", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "sent_at": { + "name": "sent_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true + }, + "subject": { + "name": "subject", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "sender_name": { + "name": "sender_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "sender_email": { + "name": "sender_email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "recipients": { + "name": "recipients", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "storage_path": { + "name": "storage_path", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "storage_hash_sha256": { + "name": "storage_hash_sha256", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "size_bytes": { + "name": "size_bytes", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "is_indexed": { + "name": "is_indexed", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "has_attachments": { + "name": "has_attachments", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "is_on_legal_hold": { + "name": "is_on_legal_hold", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "archived_at": { + "name": "archived_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "path": { + "name": "path", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "tags": { + "name": "tags", + "type": "jsonb", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "thread_id_idx": { + "name": "thread_id_idx", + "columns": [ + { + "expression": "thread_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "archived_emails_ingestion_source_id_ingestion_sources_id_fk": { + "name": "archived_emails_ingestion_source_id_ingestion_sources_id_fk", + "tableFrom": "archived_emails", + "tableTo": "ingestion_sources", + "columnsFrom": [ + "ingestion_source_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.attachments": { + "name": "attachments", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "filename": { + "name": "filename", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "mime_type": { + "name": "mime_type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "size_bytes": { + "name": "size_bytes", + "type": "bigint", + "primaryKey": false, + "notNull": true + }, + "content_hash_sha256": { + "name": "content_hash_sha256", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "storage_path": { + "name": "storage_path", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "ingestion_source_id": { + "name": "ingestion_source_id", + "type": "uuid", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "source_hash_idx": { + "name": "source_hash_idx", + "columns": [ + { + "expression": "ingestion_source_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "content_hash_sha256", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "attachments_ingestion_source_id_ingestion_sources_id_fk": { + "name": "attachments_ingestion_source_id_ingestion_sources_id_fk", + "tableFrom": "attachments", + "tableTo": "ingestion_sources", + "columnsFrom": [ + "ingestion_source_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.email_attachments": { + "name": "email_attachments", + "schema": "", + "columns": { + "email_id": { + "name": "email_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "attachment_id": { + "name": "attachment_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": { + "email_attachments_email_id_archived_emails_id_fk": { + "name": "email_attachments_email_id_archived_emails_id_fk", + "tableFrom": "email_attachments", + "tableTo": "archived_emails", + "columnsFrom": [ + "email_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "email_attachments_attachment_id_attachments_id_fk": { + "name": "email_attachments_attachment_id_attachments_id_fk", + "tableFrom": "email_attachments", + "tableTo": "attachments", + "columnsFrom": [ + "attachment_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "restrict", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "email_attachments_email_id_attachment_id_pk": { + "name": "email_attachments_email_id_attachment_id_pk", + "columns": [ + "email_id", + "attachment_id" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.audit_logs": { + "name": "audit_logs", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "bigserial", + "primaryKey": true, + "notNull": true + }, + "previous_hash": { + "name": "previous_hash", + "type": "varchar(64)", + "primaryKey": false, + "notNull": false + }, + "timestamp": { + "name": "timestamp", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "actor_identifier": { + "name": "actor_identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "actor_ip": { + "name": "actor_ip", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "action_type": { + "name": "action_type", + "type": "audit_log_action", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "target_type": { + "name": "target_type", + "type": "audit_log_target_type", + "typeSchema": "public", + "primaryKey": false, + "notNull": false + }, + "target_id": { + "name": "target_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "details": { + "name": "details", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "current_hash": { + "name": "current_hash", + "type": "varchar(64)", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.ediscovery_cases": { + "name": "ediscovery_cases", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'open'" + }, + "created_by_identifier": { + "name": "created_by_identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "ediscovery_cases_name_unique": { + "name": "ediscovery_cases_name_unique", + "nullsNotDistinct": false, + "columns": [ + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.export_jobs": { + "name": "export_jobs", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "case_id": { + "name": "case_id", + "type": "uuid", + "primaryKey": false, + "notNull": false + }, + "format": { + "name": "format", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'pending'" + }, + "query": { + "name": "query", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "file_path": { + "name": "file_path", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_by_identifier": { + "name": "created_by_identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": { + "export_jobs_case_id_ediscovery_cases_id_fk": { + "name": "export_jobs_case_id_ediscovery_cases_id_fk", + "tableFrom": "export_jobs", + "tableTo": "ediscovery_cases", + "columnsFrom": [ + "case_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.legal_holds": { + "name": "legal_holds", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "case_id": { + "name": "case_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "custodian_id": { + "name": "custodian_id", + "type": "uuid", + "primaryKey": false, + "notNull": false + }, + "hold_criteria": { + "name": "hold_criteria", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "reason": { + "name": "reason", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "applied_by_identifier": { + "name": "applied_by_identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "applied_at": { + "name": "applied_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "removed_at": { + "name": "removed_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": { + "legal_holds_case_id_ediscovery_cases_id_fk": { + "name": "legal_holds_case_id_ediscovery_cases_id_fk", + "tableFrom": "legal_holds", + "tableTo": "ediscovery_cases", + "columnsFrom": [ + "case_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "legal_holds_custodian_id_custodians_id_fk": { + "name": "legal_holds_custodian_id_custodians_id_fk", + "tableFrom": "legal_holds", + "tableTo": "custodians", + "columnsFrom": [ + "custodian_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.retention_policies": { + "name": "retention_policies", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "priority": { + "name": "priority", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "retention_period_days": { + "name": "retention_period_days", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "action_on_expiry": { + "name": "action_on_expiry", + "type": "retention_action", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "is_enabled": { + "name": "is_enabled", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "conditions": { + "name": "conditions", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "retention_policies_name_unique": { + "name": "retention_policies_name_unique", + "nullsNotDistinct": false, + "columns": [ + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.custodians": { + "name": "custodians", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "display_name": { + "name": "display_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "source_type": { + "name": "source_type", + "type": "ingestion_provider", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "custodians_email_unique": { + "name": "custodians_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.ingestion_sources": { + "name": "ingestion_sources", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "user_id": { + "name": "user_id", + "type": "uuid", + "primaryKey": false, + "notNull": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "provider": { + "name": "provider", + "type": "ingestion_provider", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "credentials": { + "name": "credentials", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "ingestion_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true, + "default": "'pending_auth'" + }, + "last_sync_started_at": { + "name": "last_sync_started_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "last_sync_finished_at": { + "name": "last_sync_finished_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + }, + "last_sync_status_message": { + "name": "last_sync_status_message", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "sync_state": { + "name": "sync_state", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": { + "ingestion_sources_user_id_users_id_fk": { + "name": "ingestion_sources_user_id_users_id_fk", + "tableFrom": "ingestion_sources", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.roles": { + "name": "roles", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "policies": { + "name": "policies", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "slug": { + "name": "slug", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "roles_name_unique": { + "name": "roles_name_unique", + "nullsNotDistinct": false, + "columns": [ + "name" + ] + }, + "roles_slug_unique": { + "name": "roles_slug_unique", + "nullsNotDistinct": false, + "columns": [ + "slug" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sessions": { + "name": "sessions", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": { + "sessions_user_id_users_id_fk": { + "name": "sessions_user_id_users_id_fk", + "tableFrom": "sessions", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user_roles": { + "name": "user_roles", + "schema": "", + "columns": { + "user_id": { + "name": "user_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "role_id": { + "name": "role_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": { + "user_roles_user_id_users_id_fk": { + "name": "user_roles_user_id_users_id_fk", + "tableFrom": "user_roles", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "user_roles_role_id_roles_id_fk": { + "name": "user_roles_role_id_roles_id_fk", + "tableFrom": "user_roles", + "tableTo": "roles", + "columnsFrom": [ + "role_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "user_roles_user_id_role_id_pk": { + "name": "user_roles_user_id_role_id_pk", + "columns": [ + "user_id", + "role_id" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.users": { + "name": "users", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "first_name": { + "name": "first_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "last_name": { + "name": "last_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "password": { + "name": "password", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "provider": { + "name": "provider", + "type": "text", + "primaryKey": false, + "notNull": false, + "default": "'local'" + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "users_email_unique": { + "name": "users_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.system_settings": { + "name": "system_settings", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "config": { + "name": "config", + "type": "jsonb", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.api_keys": { + "name": "api_keys", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "key": { + "name": "key", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "key_hash": { + "name": "key_hash", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": { + "api_keys_user_id_users_id_fk": { + "name": "api_keys_user_id_users_id_fk", + "tableFrom": "api_keys", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": { + "public.retention_action": { + "name": "retention_action", + "schema": "public", + "values": [ + "delete_permanently", + "notify_admin" + ] + }, + "public.ingestion_provider": { + "name": "ingestion_provider", + "schema": "public", + "values": [ + "google_workspace", + "microsoft_365", + "generic_imap", + "pst_import", + "eml_import", + "mbox_import" + ] + }, + "public.ingestion_status": { + "name": "ingestion_status", + "schema": "public", + "values": [ + "active", + "paused", + "error", + "pending_auth", + "syncing", + "importing", + "auth_success", + "imported" + ] + }, + "public.audit_log_action": { + "name": "audit_log_action", + "schema": "public", + "values": [ + "CREATE", + "READ", + "UPDATE", + "DELETE", + "LOGIN", + "LOGOUT", + "SETUP", + "IMPORT", + "PAUSE", + "SYNC", + "UPLOAD", + "SEARCH", + "DOWNLOAD", + "GENERATE" + ] + }, + "public.audit_log_target_type": { + "name": "audit_log_target_type", + "schema": "public", + "values": [ + "ApiKey", + "ArchivedEmail", + "Dashboard", + "IngestionSource", + "Role", + "SystemSettings", + "User", + "File" + ] + } + }, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/backend/src/database/migrations/meta/_journal.json b/packages/backend/src/database/migrations/meta/_journal.json index 2204d41..921d559 100644 --- a/packages/backend/src/database/migrations/meta/_journal.json +++ b/packages/backend/src/database/migrations/meta/_journal.json @@ -162,6 +162,13 @@ "when": 1759701622932, "tag": "0022_complete_triton", "breakpoints": true + }, + { + "idx": 23, + "version": "7", + "when": 1760354094610, + "tag": "0023_swift_swordsman", + "breakpoints": true } ] } \ No newline at end of file diff --git a/packages/backend/src/database/schema/attachments.ts b/packages/backend/src/database/schema/attachments.ts index fcb1e49..8d10fbe 100644 --- a/packages/backend/src/database/schema/attachments.ts +++ b/packages/backend/src/database/schema/attachments.ts @@ -1,5 +1,5 @@ import { relations } from 'drizzle-orm'; -import { pgTable, text, uuid, bigint, primaryKey, uniqueIndex } from 'drizzle-orm/pg-core'; +import { pgTable, text, uuid, bigint, primaryKey, index } from 'drizzle-orm/pg-core'; import { archivedEmails } from './archived-emails'; import { ingestionSources } from './ingestion-sources'; @@ -17,7 +17,7 @@ export const attachments = pgTable( }), }, (table) => [ - uniqueIndex('source_hash_unique').on(table.ingestionSourceId, table.contentHashSha256), + index('source_hash_idx').on(table.ingestionSourceId, table.contentHashSha256), ] ); diff --git a/packages/backend/src/services/ArchivedEmailService.ts b/packages/backend/src/services/ArchivedEmailService.ts index 9be1228..ccf322d 100644 --- a/packages/backend/src/services/ArchivedEmailService.ts +++ b/packages/backend/src/services/ArchivedEmailService.ts @@ -213,7 +213,7 @@ export class ArchivedEmailService { // Load and handle attachments before deleting the email itself if (email.hasAttachments) { - const emailAttachmentsResult = await db + const attachmentsForEmail = await db .select({ attachmentId: attachments.id, storagePath: attachments.storagePath, @@ -223,37 +223,33 @@ export class ArchivedEmailService { .where(eq(emailAttachments.emailId, emailId)); try { - for (const attachment of emailAttachmentsResult) { - const [refCount] = await db - .select({ count: count(emailAttachments.emailId) }) + for (const attachment of attachmentsForEmail) { + // Delete the link between this email and the attachment record. + await db + .delete(emailAttachments) + .where( + and( + eq(emailAttachments.emailId, emailId), + eq(emailAttachments.attachmentId, attachment.attachmentId) + ) + ); + + // Check if any other emails are linked to this attachment record. + const [recordRefCount] = await db + .select({ count: count() }) .from(emailAttachments) .where(eq(emailAttachments.attachmentId, attachment.attachmentId)); - if (refCount.count === 1) { + // If no other emails are linked to this record, it's safe to delete it and the file. + if (recordRefCount.count === 0) { await storage.delete(attachment.storagePath); - await db - .delete(emailAttachments) - .where( - and( - eq(emailAttachments.emailId, emailId), - eq(emailAttachments.attachmentId, attachment.attachmentId) - ) - ); await db .delete(attachments) .where(eq(attachments.id, attachment.attachmentId)); - } else { - await db - .delete(emailAttachments) - .where( - and( - eq(emailAttachments.emailId, emailId), - eq(emailAttachments.attachmentId, attachment.attachmentId) - ) - ); } } - } catch { + } catch (error) { + console.error('Failed to delete email attachments', error); throw new Error('Failed to delete email attachments'); } } diff --git a/packages/backend/src/services/IngestionService.ts b/packages/backend/src/services/IngestionService.ts index 4c2ae59..6f01233 100644 --- a/packages/backend/src/services/IngestionService.ts +++ b/packages/backend/src/services/IngestionService.ts @@ -20,7 +20,7 @@ import { attachments as attachmentsSchema, emailAttachments, } from '../database/schema'; -import { createHash } from 'crypto'; +import { createHash, randomUUID } from 'crypto'; import { logger } from '../config/logger'; import { SearchService } from './SearchService'; import { config } from '../config/index'; @@ -456,33 +456,63 @@ export class IngestionService { const attachmentHash = createHash('sha256') .update(attachmentBuffer) .digest('hex'); - const attachmentPath = `${config.storage.openArchiverFolderName}/${source.name.replaceAll(' ', '-')}-${source.id}/attachments/${attachment.filename}`; - await storage.put(attachmentPath, attachmentBuffer); - const [newAttachment] = await db - .insert(attachmentsSchema) - .values({ - filename: attachment.filename, - mimeType: attachment.contentType, - sizeBytes: attachment.size, - contentHashSha256: attachmentHash, - storagePath: attachmentPath, - ingestionSourceId: source.id, - }) - .onConflictDoUpdate({ - target: [ - attachmentsSchema.ingestionSourceId, - attachmentsSchema.contentHashSha256, - ], - set: { filename: attachment.filename }, - }) - .returning(); + // Check if an attachment with the same hash already exists for this source + const existingAttachment = await db.query.attachments.findFirst({ + where: and( + eq(attachmentsSchema.contentHashSha256, attachmentHash), + eq(attachmentsSchema.ingestionSourceId, source.id) + ), + }); + let storagePath: string; + + if (existingAttachment) { + // If it exists, reuse the storage path and don't save the file again + storagePath = existingAttachment.storagePath; + logger.info( + { + attachmentHash, + ingestionSourceId: source.id, + reusedPath: storagePath, + }, + 'Reusing existing attachment file for deduplication.' + ); + } else { + // If it's a new attachment, create a unique path and save it + const uniqueId = randomUUID().slice(0, 7); + storagePath = `${config.storage.openArchiverFolderName}/${source.name.replaceAll(' ', '-')}-${source.id}/attachments/${uniqueId}-${attachment.filename}`; + await storage.put(storagePath, attachmentBuffer); + } + + let attachmentRecord = existingAttachment; + + if (!attachmentRecord) { + // If it's a new attachment, create a unique path and save it + const uniqueId = randomUUID().slice(0, 5); + const storagePath = `${config.storage.openArchiverFolderName}/${source.name.replaceAll(' ', '-')}-${source.id}/attachments/${uniqueId}-${attachment.filename}`; + await storage.put(storagePath, attachmentBuffer); + + // Insert a new attachment record + [attachmentRecord] = await db + .insert(attachmentsSchema) + .values({ + filename: attachment.filename, + mimeType: attachment.contentType, + sizeBytes: attachment.size, + contentHashSha256: attachmentHash, + storagePath: storagePath, + ingestionSourceId: source.id, + }) + .returning(); + } + + // Link the attachment record (either new or existing) to the email await db .insert(emailAttachments) .values({ emailId: archivedEmail.id, - attachmentId: newAttachment.id, + attachmentId: attachmentRecord.id, }) .onConflictDoNothing(); }