diff --git a/packages/backend/package.json b/packages/backend/package.json index df04539..5e08d98 100644 --- a/packages/backend/package.json +++ b/packages/backend/package.json @@ -56,6 +56,7 @@ "mammoth": "^1.9.1", "meilisearch": "^0.51.0", "multer": "^2.0.2", + "nodemailer": "^8.0.2", "pdf2json": "^3.1.6", "pg": "^8.16.3", "pino": "^9.7.0", @@ -77,6 +78,7 @@ "@types/microsoft-graph": "^2.40.1", "@types/multer": "^2.0.0", "@types/node": "^24.0.12", + "@types/nodemailer": "^7.0.11", "@types/yauzl": "^2.10.3", "ts-node-dev": "^2.0.0", "tsconfig-paths": "^4.2.0", diff --git a/packages/backend/src/helpers/emlUtils.ts b/packages/backend/src/helpers/emlUtils.ts new file mode 100644 index 0000000..19597c3 --- /dev/null +++ b/packages/backend/src/helpers/emlUtils.ts @@ -0,0 +1,218 @@ +import { simpleParser, type Attachment } from 'mailparser'; +import MailComposer from 'nodemailer/lib/mail-composer'; +import type Mail from 'nodemailer/lib/mailer'; +import { logger } from '../config/logger'; + +/** + * Set of headers that are either handled natively by nodemailer's MailComposer + * via dedicated options, or are structural MIME headers that will be regenerated + * when the MIME tree is rebuilt. + * + * Entries are lowercase; extractAdditionalHeaders lowercases each header key before + * looking it up here, so matching is case-insensitive. + * + * NOTE(review): 'sender' is listed here, but the MailComposer options built in + * stripAttachmentsFromEml set no `sender` field, so an original Sender header + * appears to be dropped on re-serialization. Confirm whether that is intended. + */ +const HEADERS_HANDLED_BY_COMPOSER = new Set([ + 'content-type', + 'content-transfer-encoding', + 'mime-version', + 'from', + 'to', + 'cc', + 'bcc', + 'subject', + 'message-id', + 'date', + 'in-reply-to', + 'references', + 'reply-to', + 'sender', +]); + +/** + * Determines whether a parsed attachment should be preserved in the stored .eml. + * + * An attachment is considered inline if: + * 1. mailparser explicitly marked it as related (embedded in multipart/related) + * 2. It has Content-Disposition: inline AND a Content-ID + * 3. Its Content-ID is referenced as a cid: URL in the HTML body + * + * All three checks are evaluated with OR logic (conservative: keep if any match). 
/**
 * Decides whether a parsed attachment must stay embedded in the stored .eml.
 *
 * The attachment is kept (treated as inline) when ANY of these signals holds:
 *   1. mailparser flagged it as `related` (embedded in multipart/related);
 *   2. it has a Content-ID and its Content-Disposition is `inline`;
 *   3. it has a Content-ID that the HTML body references through a `cid:` URL.
 *
 * @param attachment     Attachment as produced by mailparser.
 * @param referencedCids Lowercased Content-IDs referenced by the HTML body
 *                       (see extractCidReferences).
 * @returns `true` when the attachment should be preserved in the .eml.
 */
function isInlineAttachment(attachment: Attachment, referencedCids: Set<string>): boolean {
  // Signal 1: embedded multipart/related resource.
  if (attachment.related === true) {
    return true;
  }

  // Signals 2 and 3 both require a Content-ID.
  if (!attachment.cid) {
    return false;
  }

  // Signal 2: explicitly marked inline with a CID.
  if (attachment.contentDisposition === 'inline') {
    return true;
  }

  // Signal 3: the CID is actually referenced by the HTML body.
  return referencedCids.has(attachment.cid.toLowerCase());
}

/**
 * Extracts `cid:` references from an HTML string.
 *
 * Handles both attribute form (`src="cid:abc123"`) and CSS form
 * (`background: url(cid:abc123)`). The closing parenthesis of `url(...)` is not
 * part of the Content-ID, so `)` terminates the match.
 *
 * A `cid:` URL carries the Content-ID in percent-encoded form, so each
 * reference is recorded both verbatim and percent-decoded. Keeping both is the
 * conservative choice: an attachment is preserved if either form matches.
 *
 * @param html HTML body of the message; may be empty.
 * @returns A Set of normalized (lowercased) CID values without the "cid:" prefix.
 */
function extractCidReferences(html: string): Set<string> {
  const cids = new Set<string>();

  for (const match of html.matchAll(/\bcid:([^\s"'>)]+)/gi)) {
    const raw = match[1];
    cids.add(raw.toLowerCase());

    try {
      cids.add(decodeURIComponent(raw).toLowerCase());
    } catch {
      // Malformed percent-escape: the verbatim form recorded above is all we can use.
    }
  }

  return cids;
}

/**
 * Extracts additional headers from the parsed email's header map that are NOT
 * handled natively by nodemailer's MailComposer dedicated options.
 * These are passed through as custom headers to preserve the original email metadata.
 */
/**
 * Collects the headers of a parsed email that MailComposer does not manage
 * itself, so they can be passed through as custom headers.
 *
 * Headers whose lowercased name is in HEADERS_HANDLED_BY_COMPOSER are skipped.
 * A header that occurs several times (e.g. `received`) yields one entry per
 * occurrence, in order.
 *
 * Only the `value` (or `text`) of a structured header is emitted; its `params`
 * are not re-serialized.
 *
 * @param headers Header map of the parsed message (name -> parsed value).
 * @returns Key/value pairs, in map iteration order.
 */
function extractAdditionalHeaders(
  headers: ReadonlyMap<string, unknown>
): Array<{ key: string; value: string }> {
  const result: Array<{ key: string; value: string }> = [];

  for (const [key, value] of headers) {
    if (HEADERS_HANDLED_BY_COMPOSER.has(key.toLowerCase())) {
      continue;
    }

    // Treat single and repeated headers uniformly.
    const occurrences: unknown[] = Array.isArray(value) ? value : [value];
    for (const occurrence of occurrences) {
      const text = headerValueToString(occurrence);
      if (text !== undefined) {
        result.push({ key, value: text });
      }
    }
  }

  return result;
}

/**
 * Renders one parsed header value as a header string.
 *
 * - plain strings are returned as-is;
 * - structured values `{ value: '...', params }` yield their string `value`;
 * - address-like values `{ value: [...], text: '...' }` yield `text`. Calling
 *   String() on the array of address objects would give "[object Object]";
 * - anything else that cannot be rendered meaningfully yields `undefined`.
 */
function headerValueToString(item: unknown): string | undefined {
  if (typeof item === 'string') {
    return item;
  }
  if (item === null || typeof item !== 'object' || !('value' in item)) {
    return undefined;
  }

  const { value, text } = item as { value?: unknown; text?: unknown };
  if (typeof value === 'string') {
    return value;
  }
  if (typeof text === 'string') {
    return text;
  }
  if (value === null || value === undefined || typeof value === 'object') {
    return undefined;
  }
  return String(value);
}

/** Minimal shape read from an address object; mailparser's AddressObject satisfies it. */
type AddressLike = { text: string };

/**
 * Converts a mailparser AddressObject or AddressObject[] to a comma-separated string
 * suitable for nodemailer's MailComposer options.
 *
 * Entries with an empty `text` are skipped so the result never contains stray
 * separators.
 *
 * @param addresses One address object, a list of them, or `undefined`.
 * @returns The joined address text, or `undefined` when there is nothing to emit.
 */
function addressToString(
  addresses: AddressLike | AddressLike[] | undefined
): string | undefined {
  if (!addresses) return undefined;
  const list = Array.isArray(addresses) ? addresses : [addresses];
  const joined = list
    .map((entry) => entry.text)
    .filter((text) => Boolean(text))
    .join(', ');
  return joined || undefined;
}

/**
 * Strips non-inline attachments from a raw .eml buffer to avoid double-storing
 * attachment data (since attachments are already stored separately).
 *
 * Inline images referenced via cid: in the HTML body are preserved so that
 * the email renders correctly when viewed.
 *
 * If the email has no strippable attachments, the original buffer is returned
 * unchanged (zero overhead).
 *
 * If re-serialization fails for any reason, the original buffer is returned
 * and a warning is logged — email ingestion is never blocked by this function.
 *
 * @param emlBuffer The raw .eml file as a Buffer.
 */
+ * @returns A new Buffer with non-inline attachments removed, or the original if nothing was stripped. + */ +export async function stripAttachmentsFromEml(emlBuffer: Buffer): Promise { + try { + const parsed = await simpleParser(emlBuffer); + + // No attachments at all: return the caller's buffer as-is, skipping re-serialization + if (!parsed.attachments || parsed.attachments.length === 0) { + return emlBuffer; + } + + // Build the set of cid values referenced in the HTML body + // (an empty string is used when parsed.html is falsy, which yields an empty set) + const htmlBody = parsed.html || ''; + const referencedCids = extractCidReferences(htmlBody); + + // Check if there's anything to strip; when every attachment is inline the + // original buffer is returned unchanged just below + const hasStrippableAttachments = parsed.attachments.some( + (a) => !isInlineAttachment(a, referencedCids) + ); + + if (!hasStrippableAttachments) { + return emlBuffer; + } + + // Build the list of inline attachments to preserve in the .eml + // (each kept attachment is re-declared with contentDisposition 'inline') + const inlineAttachments: Mail.Attachment[] = []; + for (const attachment of parsed.attachments) { + if (isInlineAttachment(attachment, referencedCids)) { + inlineAttachments.push({ + content: attachment.content, + contentType: attachment.contentType, + contentDisposition: 'inline' as const, + filename: attachment.filename || undefined, + cid: attachment.cid || undefined, + }); + } + } + + // Collect additional headers not handled by MailComposer's dedicated fields + const additionalHeaders = extractAdditionalHeaders(parsed.headers); + + // Build the mail options for MailComposer: the message is rebuilt from the parsed + // fields rather than edited in place. + // NOTE(review): no `sender` option is set here, yet 'sender' is excluded from the + // pass-through headers, so an original Sender header appears to be dropped. Confirm. + const mailOptions: Mail.Options = { + from: addressToString(parsed.from), + to: addressToString(parsed.to), + cc: addressToString(parsed.cc), + bcc: addressToString(parsed.bcc), + replyTo: addressToString(parsed.replyTo), + subject: parsed.subject, + messageId: parsed.messageId, + date: parsed.date, + inReplyTo: parsed.inReplyTo, + references: Array.isArray(parsed.references) + ? 
+ parsed.references.join(' ') + : parsed.references, + text: parsed.text || undefined, + html: parsed.html || undefined, + attachments: inlineAttachments, + headers: additionalHeaders, + }; + + const composer = new MailComposer(mailOptions); + const builtMessage = composer.compile(); + const stream = builtMessage.createReadStream(); + + return await new Promise((resolve, reject) => { + const chunks: Buffer[] = []; + stream.on('data', (chunk: Buffer) => chunks.push(chunk)); + stream.on('end', () => resolve(Buffer.concat(chunks))); + stream.on('error', reject); + }); + } catch (error) { + // Any failure above (parsing, composing, or a stream error) lands here: fall back + // to the original buffer, unchanged. + // Email ingestion should never be blocked by an attachment-stripping failure. + logger.warn( + { error }, + 'Failed to strip non-inline attachments from .eml — storing original.' + ); + return emlBuffer; + } +} diff --git a/packages/backend/src/services/IngestionService.ts b/packages/backend/src/services/IngestionService.ts index 037cf41..ddfc299 100644 --- a/packages/backend/src/services/IngestionService.ts +++ b/packages/backend/src/services/IngestionService.ts @@ -15,6 +15,7 @@ import { ingestionQueue } from '../jobs/queues'; import type { JobType } from 'bullmq'; import { StorageService } from './StorageService'; import type { IInitialImportJob, EmailObject } from '@open-archiver/types'; +import { stripAttachmentsFromEml } from '../helpers/emlUtils'; import { archivedEmails, attachments as attachmentsSchema, @@ -446,7 +447,10 @@ export class IngestionService { return null; } - const emlBuffer = email.eml ?? Buffer.from(email.body, 'utf-8'); + const rawEmlBuffer = email.eml ?? Buffer.from(email.body, 'utf-8'); + // Strip non-inline attachments from the .eml to avoid double-storing + // attachment data (attachments are stored separately). + // Note: emailHash below is computed over the stripped buffer, not the raw one. + const emlBuffer = await stripAttachmentsFromEml(rawEmlBuffer); const emailHash = createHash('sha256').update(emlBuffer).digest('hex'); const sanitizedPath = email.path ? 
email.path : ''; const emailPath = `${config.storage.openArchiverFolderName}/${source.name.replaceAll(' ', '-')}-${source.id}/emails/${sanitizedPath}${email.id}.eml`; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c793899..97aedbd 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -167,6 +167,9 @@ importers: multer: specifier: ^2.0.2 version: 2.0.2 + nodemailer: + specifier: ^8.0.2 + version: 8.0.2 pdf2json: specifier: ^3.1.6 version: 3.1.6 @@ -225,6 +228,9 @@ importers: '@types/node': specifier: ^24.0.12 version: 24.0.13 + '@types/nodemailer': + specifier: ^7.0.11 + version: 7.0.11 '@types/yauzl': specifier: ^2.10.3 version: 2.10.3 @@ -1883,6 +1889,9 @@ packages: '@types/node@24.0.13': resolution: {integrity: sha512-Qm9OYVOFHFYg3wJoTSrz80hoec5Lia/dPp84do3X7dZvLikQvM1YpmvTBEdIr/e+U8HTkFjLHLnl78K/qjf+jQ==} + '@types/nodemailer@7.0.11': + resolution: {integrity: sha512-E+U4RzR2dKrx+u3N4DlsmLaDC6mMZOM/TPROxA0UAPiTgI0y4CEFBmZE+coGWTjakDriRsXG368lNk1u9Q0a2g==} + '@types/qs@6.14.0': resolution: {integrity: sha512-eOunJqu0K1923aExK6y8p6fsihYEn/BYuQ4g0CxAAgFc4b/ZLN4CrsRZ55srTdqoiLzU2B2evC+apEIxprEzkQ==} @@ -2974,11 +2983,12 @@ packages: glob@10.4.5: resolution: {integrity: sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==} + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me hasBin: true glob@7.2.3: resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==} - deprecated: Glob versions prior to v9 are no longer supported + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. 
Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me google-auth-library@10.1.0: resolution: {integrity: sha512-GspVjZj1RbyRWpQ9FbAXMKjFGzZwDKnUHi66JJ+tcjcu5/xYAP1pdlWotCuIkMwjfVsxxDvsGZXGLzRt72D0sQ==} @@ -3413,6 +3423,7 @@ packages: mailsplit@5.4.5: resolution: {integrity: sha512-oMfhmvclR689IIaQmIcR5nODnZRRVwAKtqFT407TIvmhX2OLUBnshUTcxzQBt3+96sZVDud9NfSe1NxAkUNXEQ==} + deprecated: This package has been renamed to @zone-eu/mailsplit. Please update your dependencies. make-error@1.3.6: resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==} @@ -3663,6 +3674,10 @@ packages: resolution: {integrity: sha512-nsrh2lO3j4GkLLXoeEksAMgAOqxOv6QumNRVQTJwKH4nuiww6iC2y7GyANs9kRAxCexg3+lTWM3PZ91iLlVjfg==} engines: {node: '>=6.0.0'} + nodemailer@8.0.2: + resolution: {integrity: sha512-zbj002pZAIkWQFxyAaqoxvn+zoIwRnS40hgjqTXudKOOJkiFFgBeNqjgD3/YCR12sZnrghWYBY+yP1ZucdDRpw==} + engines: {node: '>=6.0.0'} + nopt@5.0.0: resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} engines: {node: '>=6'} @@ -3844,6 +3859,7 @@ packages: prebuild-install@7.1.3: resolution: {integrity: sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==} engines: {node: '>=10'} + deprecated: No longer maintained. Please contact the author of the relevant native addon; alternatives are available. hasBin: true prettier-plugin-svelte@3.4.0: @@ -6476,6 +6492,10 @@ snapshots: dependencies: undici-types: 7.8.0 + '@types/nodemailer@7.0.11': + dependencies: + '@types/node': 24.0.13 + '@types/qs@6.14.0': {} '@types/range-parser@1.2.7': {} @@ -8390,6 +8410,8 @@ snapshots: nodemailer@7.0.5: {} + nodemailer@8.0.2: {} + nopt@5.0.0: dependencies: abbrev: 1.1.1