mirror of
https://github.com/LogicLabs-OU/OpenArchiver.git
synced 2026-04-06 00:31:57 +02:00
Scope attachment deduplication to ingestion source
Previously, attachment deduplication was handled globally by enforcing a unique constraint on the content hash (`contentHashSha256`) in the `attachments` table. As a result, an attachment from one ingestion source would be incorrectly linked when the same attachment was processed by a different source. This commit refactors the deduplication logic so that it is scoped per ingestion source.

Changes:

- **Schema:** The `attachments` table schema now includes a nullable `ingestionSourceId` column, and a composite unique index on `(ingestionSourceId, contentHashSha256)` enforces per-source uniqueness. The `ingestionSourceId` column is nullable to preserve backward compatibility with existing databases.
- **Ingestion Logic:** The `IngestionService` now provides the `ingestionSourceId` when inserting attachment records. The `onConflictDoUpdate` clause targets the new composite key, so attachments are considered duplicates only if they have the same hash and originate from the same ingestion source.
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -27,6 +27,4 @@ docs/.vitepress/cache
|
||||
|
||||
|
||||
# TS
|
||||
tsconfig.tsbuildinfo
|
||||
packages/backend/tsconfig.tsbuildinfo
|
||||
packages/types/tsconfig.tsbuildinfo
|
||||
**/tsconfig.tsbuildinfo
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
ALTER TABLE "attachments" DROP CONSTRAINT "attachments_content_hash_sha256_unique";--> statement-breakpoint
|
||||
ALTER TABLE "attachments" ADD COLUMN "ingestion_source_id" uuid;--> statement-breakpoint
|
||||
ALTER TABLE "attachments" ADD CONSTRAINT "attachments_ingestion_source_id_ingestion_sources_id_fk" FOREIGN KEY ("ingestion_source_id") REFERENCES "public"."ingestion_sources"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
CREATE UNIQUE INDEX "source_hash_unique" ON "attachments" USING btree ("ingestion_source_id","content_hash_sha256");
|
||||
1326
packages/backend/src/database/migrations/meta/0022_snapshot.json
Normal file
1326
packages/backend/src/database/migrations/meta/0022_snapshot.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -155,6 +155,13 @@
|
||||
"when": 1759412986134,
|
||||
"tag": "0021_nosy_veda",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 22,
|
||||
"version": "7",
|
||||
"when": 1759701622932,
|
||||
"tag": "0022_complete_triton",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,15 +1,25 @@
|
||||
import { relations } from 'drizzle-orm';
|
||||
import { pgTable, text, uuid, bigint, primaryKey } from 'drizzle-orm/pg-core';
|
||||
import { pgTable, text, uuid, bigint, primaryKey, uniqueIndex } from 'drizzle-orm/pg-core';
|
||||
import { archivedEmails } from './archived-emails';
|
||||
import { ingestionSources } from './ingestion-sources';
|
||||
|
||||
export const attachments = pgTable('attachments', {
|
||||
id: uuid('id').primaryKey().defaultRandom(),
|
||||
filename: text('filename').notNull(),
|
||||
mimeType: text('mime_type'),
|
||||
sizeBytes: bigint('size_bytes', { mode: 'number' }).notNull(),
|
||||
contentHashSha256: text('content_hash_sha256').notNull().unique(),
|
||||
storagePath: text('storage_path').notNull(),
|
||||
});
|
||||
export const attachments = pgTable(
|
||||
'attachments',
|
||||
{
|
||||
id: uuid('id').primaryKey().defaultRandom(),
|
||||
filename: text('filename').notNull(),
|
||||
mimeType: text('mime_type'),
|
||||
sizeBytes: bigint('size_bytes', { mode: 'number' }).notNull(),
|
||||
contentHashSha256: text('content_hash_sha256').notNull(),
|
||||
storagePath: text('storage_path').notNull(),
|
||||
ingestionSourceId: uuid('ingestion_source_id').references(() => ingestionSources.id, {
|
||||
onDelete: 'cascade',
|
||||
}),
|
||||
},
|
||||
(table) => [
|
||||
uniqueIndex('source_hash_unique').on(table.ingestionSourceId, table.contentHashSha256),
|
||||
]
|
||||
);
|
||||
|
||||
export const emailAttachments = pgTable(
|
||||
'email_attachments',
|
||||
|
||||
@@ -465,9 +465,13 @@ export class IngestionService {
|
||||
sizeBytes: attachment.size,
|
||||
contentHashSha256: attachmentHash,
|
||||
storagePath: attachmentPath,
|
||||
ingestionSourceId: source.id,
|
||||
})
|
||||
.onConflictDoUpdate({
|
||||
target: attachmentsSchema.contentHashSha256,
|
||||
target: [
|
||||
attachmentsSchema.ingestionSourceId,
|
||||
attachmentsSchema.contentHashSha256,
|
||||
],
|
||||
set: { filename: attachment.filename },
|
||||
})
|
||||
.returning();
|
||||
|
||||
@@ -6,7 +6,10 @@ import { createCipheriv, createDecipheriv, randomBytes } from 'crypto';
|
||||
import { streamToBuffer } from '../helpers/streamToBuffer';
|
||||
import { Readable } from 'stream';
|
||||
|
||||
const ENCRYPTION_PREFIX = Buffer.from('oa_enc_v1::');
|
||||
/**
|
||||
* A unique identifier for Open Archiver encrypted files. To ensure compatibility, this value SHOULD NOT BE ALTERED in future development.
|
||||
*/
|
||||
const ENCRYPTION_PREFIX = Buffer.from('oa_enc_idf_v1::');
|
||||
|
||||
export class StorageService implements IStorageProvider {
|
||||
private provider: IStorageProvider;
|
||||
|
||||
Reference in New Issue
Block a user