Scope attachment deduplication to ingestion source

Previously, attachment deduplication was handled globally by enforcing a unique constraint on the content hash (`contentHashSha256`) in the `attachments` table. As a result, when two ingestion sources processed an attachment with the same content, the second source's email was linked to the attachment record created by the first source instead of getting a record of its own.

This commit refactors the deduplication logic to be scoped on a per-ingestion-source basis.

Changes:
-   **Schema:** The global unique constraint on `contentHashSha256` has been removed from the `attachments` table, and a nullable `ingestionSourceId` column has been added. A composite unique index on `(ingestionSourceId, contentHashSha256)` now enforces per-source uniqueness. The `ingestionSourceId` column is nullable to ensure backward compatibility with existing databases, whose existing attachment rows have no ingestion source recorded.
-   **Ingestion Logic:** The `IngestionService` has been updated to provide the `ingestionSourceId` when inserting attachment records. The `onConflictDoUpdate` clause now targets the new composite unique index `(ingestionSourceId, contentHashSha256)`, ensuring that attachments are only considered duplicates if they have the same hash and originate from the same ingestion source.
This commit is contained in:
Wayne
2025-10-06 00:04:34 +02:00
parent 2a3d6846d8
commit 659d130f3b
7 changed files with 1366 additions and 14 deletions

4
.gitignore vendored
View File

@@ -27,6 +27,4 @@ docs/.vitepress/cache
# TS
tsconfig.tsbuildinfo
packages/backend/tsconfig.tsbuildinfo
packages/types/tsconfig.tsbuildinfo
**/tsconfig.tsbuildinfo

View File

@@ -0,0 +1,4 @@
-- Replace global uniqueness of the content hash with uniqueness per ingestion source.
ALTER TABLE "attachments" DROP CONSTRAINT "attachments_content_hash_sha256_unique";--> statement-breakpoint
-- Added without NOT NULL or a default, so rows that already exist get NULL here.
ALTER TABLE "attachments" ADD COLUMN "ingestion_source_id" uuid;--> statement-breakpoint
-- Attachments are deleted together with the ingestion source they reference (ON DELETE cascade).
ALTER TABLE "attachments" ADD CONSTRAINT "attachments_ingestion_source_id_ingestion_sources_id_fk" FOREIGN KEY ("ingestion_source_id") REFERENCES "public"."ingestion_sources"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
-- NOTE(review): PostgreSQL treats NULLs as distinct in unique indexes by default, so rows with a NULL
-- ingestion_source_id are presumably not deduplicated by this index -- confirm this is intended.
CREATE UNIQUE INDEX "source_hash_unique" ON "attachments" USING btree ("ingestion_source_id","content_hash_sha256");

File diff suppressed because it is too large Load Diff

View File

@@ -155,6 +155,13 @@
"when": 1759412986134,
"tag": "0021_nosy_veda",
"breakpoints": true
},
{
"idx": 22,
"version": "7",
"when": 1759701622932,
"tag": "0022_complete_triton",
"breakpoints": true
}
]
}

View File

@@ -1,15 +1,25 @@
import { relations } from 'drizzle-orm';
import { pgTable, text, uuid, bigint, primaryKey } from 'drizzle-orm/pg-core';
import { pgTable, text, uuid, bigint, primaryKey, uniqueIndex } from 'drizzle-orm/pg-core';
import { archivedEmails } from './archived-emails';
import { ingestionSources } from './ingestion-sources';
export const attachments = pgTable('attachments', {
id: uuid('id').primaryKey().defaultRandom(),
filename: text('filename').notNull(),
mimeType: text('mime_type'),
sizeBytes: bigint('size_bytes', { mode: 'number' }).notNull(),
contentHashSha256: text('content_hash_sha256').notNull().unique(),
storagePath: text('storage_path').notNull(),
});
/**
 * Drizzle table definition for `attachments`.
 *
 * Uniqueness is enforced by the `source_hash_unique` index on
 * (`ingestion_source_id`, `content_hash_sha256`), not on the content hash alone.
 */
export const attachments = pgTable(
'attachments',
{
id: uuid('id').primaryKey().defaultRandom(),
filename: text('filename').notNull(),
mimeType: text('mime_type'),
sizeBytes: bigint('size_bytes', { mode: 'number' }).notNull(),
// No per-column unique constraint here; uniqueness comes from the composite index below.
contentHashSha256: text('content_hash_sha256').notNull(),
storagePath: text('storage_path').notNull(),
// Nullable (no `.notNull()`). The row is deleted when the referenced ingestion source is deleted.
ingestionSourceId: uuid('ingestion_source_id').references(() => ingestionSources.id, {
onDelete: 'cascade',
}),
},
(table) => [
// NOTE(review): PostgreSQL treats NULLs as distinct in unique indexes by default, so rows with a
// NULL ingestionSourceId are presumably not deduplicated by this index — confirm this is intended.
uniqueIndex('source_hash_unique').on(table.ingestionSourceId, table.contentHashSha256),
]
);
export const emailAttachments = pgTable(
'email_attachments',

View File

@@ -465,9 +465,13 @@ export class IngestionService {
sizeBytes: attachment.size,
contentHashSha256: attachmentHash,
storagePath: attachmentPath,
ingestionSourceId: source.id,
})
.onConflictDoUpdate({
target: attachmentsSchema.contentHashSha256,
target: [
attachmentsSchema.ingestionSourceId,
attachmentsSchema.contentHashSha256,
],
set: { filename: attachment.filename },
})
.returning();

View File

@@ -6,7 +6,10 @@ import { createCipheriv, createDecipheriv, randomBytes } from 'crypto';
import { streamToBuffer } from '../helpers/streamToBuffer';
import { Readable } from 'stream';
const ENCRYPTION_PREFIX = Buffer.from('oa_enc_v1::');
/**
 * A unique identifier for Open Archiver encrypted files, held as a Buffer.
 * This value SHOULD NOT BE ALTERED in future development to ensure compatibility.
 *
 * NOTE(review): presumably this marker is written at the start of encrypted content and checked
 * on read (usage is outside this view). If so, data tagged with any other prefix value would not
 * be recognized as encrypted — confirm no stored files carry a different prefix.
 */
const ENCRYPTION_PREFIX = Buffer.from('oa_enc_idf_v1::');
export class StorageService implements IStorageProvider {
private provider: IStorageProvider;