feat: strip non-inline attachments from EML before storage

Add nodemailer dependency and emlUtils helper to remove non-inline
attachments from .eml buffers during ingestion. This avoids
double-storing attachment data since attachments are already stored
separately.
This commit is contained in:
wayneshn
2026-03-15 00:04:31 +01:00
parent 98f7712607
commit 118b6a56be
4 changed files with 248 additions and 2 deletions

View File

@@ -56,6 +56,7 @@
"mammoth": "^1.9.1",
"meilisearch": "^0.51.0",
"multer": "^2.0.2",
"nodemailer": "^8.0.2",
"pdf2json": "^3.1.6",
"pg": "^8.16.3",
"pino": "^9.7.0",
@@ -77,6 +78,7 @@
"@types/microsoft-graph": "^2.40.1",
"@types/multer": "^2.0.0",
"@types/node": "^24.0.12",
"@types/nodemailer": "^7.0.11",
"@types/yauzl": "^2.10.3",
"ts-node-dev": "^2.0.0",
"tsconfig-paths": "^4.2.0",

View File

@@ -0,0 +1,218 @@
import { simpleParser, type Attachment } from 'mailparser';
import MailComposer from 'nodemailer/lib/mail-composer';
import type Mail from 'nodemailer/lib/mailer';
import { logger } from '../config/logger';
/**
 * Lower-cased names of headers that must not be forwarded as custom headers
 * when a message is rebuilt with nodemailer's MailComposer.
 *
 * Two groups are listed:
 *  - structural MIME headers that describe the ORIGINAL root part; the rebuilt
 *    MIME tree gets its own, so copying the old ones (for example a root-level
 *    `Content-Disposition: attachment`) would mislabel the new root node;
 *  - envelope/identity headers that are supplied to MailComposer through
 *    dedicated options instead of the generic `headers` list.
 */
const HEADERS_HANDLED_BY_COMPOSER = new Set<string>([
	// Structural MIME headers (regenerated for the rebuilt tree)
	'content-type',
	'content-transfer-encoding',
	'content-disposition',
	'content-id',
	'content-description',
	'mime-version',
	// Headers supplied through dedicated composer options
	'from',
	'to',
	'cc',
	'bcc',
	'subject',
	'message-id',
	'date',
	'in-reply-to',
	'references',
	'reply-to',
	'sender',
]);
/**
 * Decides whether a parsed attachment has to stay inside the stored .eml.
 *
 * The attachment is kept as soon as ANY of these holds (deliberately
 * conservative):
 *  - mailparser flagged it as `related`;
 *  - it carries a Content-ID and its disposition is `inline`;
 *  - it carries a Content-ID that appears in `referencedCids`.
 *
 * @param attachment     Attachment as produced by mailparser.
 * @param referencedCids Lower-cased CIDs referenced from the HTML body.
 * @returns `true` when the attachment must be preserved, `false` when it may be stripped.
 */
function isInlineAttachment(attachment: Attachment, referencedCids: Set<string>): boolean {
	if (attachment.related === true) {
		return true;
	}

	// Without a Content-ID neither of the remaining signals can apply.
	const contentId = attachment.cid;
	if (!contentId) {
		return false;
	}

	const declaredInline = attachment.contentDisposition === 'inline';
	return declaredInline || referencedCids.has(contentId.toLowerCase());
}
/**
 * Collects every `cid:` reference found in an HTML string.
 *
 * Handles attribute references (`src="cid:abc123"`) as well as unquoted CSS
 * references (`url(cid:abc123)`): the closing parenthesis terminates the
 * match instead of being swallowed into the CID.
 *
 * `cid:` URLs may be percent-encoded (e.g. `%40` for `@`), whereas the
 * Content-ID they point at is not. For references containing `%`, both the
 * raw and the decoded form are recorded so either spelling can be matched.
 *
 * @param html HTML body to scan; an empty string yields an empty set.
 * @returns A Set of lower-cased CID values without the "cid:" prefix.
 */
function extractCidReferences(html: string): Set<string> {
	const cids = new Set<string>();
	for (const match of html.matchAll(/\bcid:([^\s"'>)]+)/gi)) {
		const raw = match[1];
		if (!raw) {
			continue;
		}
		cids.add(raw.toLowerCase());
		if (raw.includes('%')) {
			try {
				cids.add(decodeURIComponent(raw).toLowerCase());
			} catch {
				// Malformed escape sequence: the raw form recorded above is all we can offer.
			}
		}
	}
	return cids;
}
/**
 * Collects the headers of a parsed email that are NOT listed in
 * HEADERS_HANDLED_BY_COMPOSER, flattened to `{ key, value }` string pairs so
 * they can be passed through as custom headers.
 *
 * Accepted value shapes:
 *  - a plain string;
 *  - an object with a primitive `value` (structured header) — `value` is used
 *    and any `params` are not reproduced;
 *  - an object whose `value` is itself an object/array and which also carries
 *    a string `text` (address-style header) — `text` is used, because
 *    stringifying the nested value would yield "[object Object]";
 *  - an array of any of the above (repeated headers) — one pair per item.
 *
 * Anything else (objects without `value`, null/undefined values, nested
 * values without a usable `text`) is skipped rather than emitted as garbage.
 *
 * @param headers Header map keyed by header name.
 * @returns Pairs in map iteration order; repeated headers keep their item order.
 */
function extractAdditionalHeaders(
	headers: Map<string, unknown>
): Array<{ key: string; value: string }> {
	// Converts one header value to its string form, or undefined when it cannot be represented.
	const toHeaderString = (raw: unknown): string | undefined => {
		if (typeof raw === 'string') {
			return raw;
		}
		if (raw === null || typeof raw !== 'object' || !('value' in raw)) {
			return undefined;
		}
		const { value, text } = raw as { value: unknown; text?: unknown };
		if (value === null || value === undefined) {
			return undefined;
		}
		if (typeof value === 'object') {
			// Nested structure: only the pre-rendered text form is safe to emit.
			return typeof text === 'string' && text !== '' ? text : undefined;
		}
		return String(value);
	};

	const result: Array<{ key: string; value: string }> = [];
	for (const [key, raw] of headers) {
		if (HEADERS_HANDLED_BY_COMPOSER.has(key.toLowerCase())) {
			continue;
		}
		// Headers like 'received' can appear multiple times and arrive as an array.
		const candidates: unknown[] = Array.isArray(raw) ? raw : [raw];
		for (const candidate of candidates) {
			const value = toHeaderString(candidate);
			if (value !== undefined) {
				result.push({ key, value });
			}
		}
	}
	return result;
}
/**
 * Flattens a mailparser address value into the single string form used for
 * MailComposer address options.
 *
 * @param addresses One AddressObject, a list of them, or nothing.
 * @returns The `text` of every entry joined with ", ", or `undefined` when
 *          the input is missing or the joined result is empty.
 */
function addressToString(
	addresses: import('mailparser').AddressObject | import('mailparser').AddressObject[] | undefined
): string | undefined {
	if (!addresses) {
		return undefined;
	}

	const entries = Array.isArray(addresses) ? addresses : [addresses];
	const texts = entries.map(({ text }) => text);
	const joined = texts.join(', ');

	return joined === '' ? undefined : joined;
}
/**
 * Strips non-inline attachments from a raw .eml buffer to avoid double-storing
 * attachment data (attachments are stored separately).
 *
 * Attachments that `isInlineAttachment` classifies as inline (for example
 * images referenced via cid: in the HTML body) are preserved so the email
 * still renders correctly when viewed.
 *
 * The original buffer is returned unchanged when:
 *  - the message has no attachments, or
 *  - every attachment is inline (nothing to strip), or
 *  - parsing / re-serialization throws — a warning is logged and ingestion is
 *    never blocked by this function.
 *
 * Otherwise the message is rebuilt with MailComposer from the parsed parts.
 * The rebuilt message keeps its Bcc header (MailComposer omits it from the
 * generated output unless `keepBcc` is set on the compiled node) and its
 * Sender header (which is excluded from the pass-through headers and so must
 * be supplied explicitly).
 *
 * @param emlBuffer The raw .eml file as a Buffer.
 * @returns A new Buffer with non-inline attachments removed, or the original if nothing was stripped.
 */
export async function stripAttachmentsFromEml(emlBuffer: Buffer): Promise<Buffer> {
	try {
		const parsed = await simpleParser(emlBuffer);

		const allAttachments = parsed.attachments ?? [];
		if (allAttachments.length === 0) {
			return emlBuffer;
		}

		// Classify each attachment once; the kept subset drives both the
		// "anything to strip?" decision and the rebuilt attachment list.
		const referencedCids = extractCidReferences(parsed.html || '');
		const keptAttachments = allAttachments.filter((attachment) =>
			isInlineAttachment(attachment, referencedCids)
		);
		if (keptAttachments.length === allAttachments.length) {
			return emlBuffer;
		}

		const inlineAttachments: Mail.Attachment[] = keptAttachments.map((attachment) => ({
			content: attachment.content,
			contentType: attachment.contentType,
			contentDisposition: 'inline' as const,
			filename: attachment.filename || undefined,
			cid: attachment.cid || undefined,
		}));

		// 'sender' is filtered out of the pass-through headers, so read it from
		// the header map here. Accept a plain string or an address-style object
		// exposing a string `text`.
		const senderHeader: unknown = parsed.headers.get('sender');
		let sender: string | undefined;
		if (typeof senderHeader === 'string') {
			sender = senderHeader || undefined;
		} else if (senderHeader !== null && typeof senderHeader === 'object') {
			const senderText = (senderHeader as { text?: unknown }).text;
			sender = typeof senderText === 'string' && senderText !== '' ? senderText : undefined;
		}

		// Collect additional headers not handled by MailComposer's dedicated fields
		const additionalHeaders = extractAdditionalHeaders(parsed.headers);

		const mailOptions: Mail.Options = {
			from: addressToString(parsed.from),
			sender,
			to: addressToString(parsed.to),
			cc: addressToString(parsed.cc),
			bcc: addressToString(parsed.bcc),
			replyTo: addressToString(parsed.replyTo),
			subject: parsed.subject,
			messageId: parsed.messageId,
			date: parsed.date,
			inReplyTo: parsed.inReplyTo,
			references: Array.isArray(parsed.references)
				? parsed.references.join(' ')
				: parsed.references,
			text: parsed.text || undefined,
			html: parsed.html || undefined,
			attachments: inlineAttachments,
			headers: additionalHeaders,
		};

		const builtMessage = new MailComposer(mailOptions).compile();
		// Archived copies must retain Bcc recipients.
		builtMessage.keepBcc = true;

		const stream = builtMessage.createReadStream();
		return await new Promise<Buffer>((resolve, reject) => {
			const chunks: Buffer[] = [];
			stream.on('data', (chunk: Buffer) => chunks.push(chunk));
			stream.on('end', () => resolve(Buffer.concat(chunks)));
			stream.on('error', reject);
		});
	} catch (error) {
		// If stripping fails, return the original buffer unchanged.
		// Email ingestion should never be blocked by an attachment-stripping failure.
		logger.warn(
			{ error },
			'Failed to strip non-inline attachments from .eml — storing original.'
		);
		return emlBuffer;
	}
}

View File

@@ -15,6 +15,7 @@ import { ingestionQueue } from '../jobs/queues';
import type { JobType } from 'bullmq';
import { StorageService } from './StorageService';
import type { IInitialImportJob, EmailObject } from '@open-archiver/types';
import { stripAttachmentsFromEml } from '../helpers/emlUtils';
import {
archivedEmails,
attachments as attachmentsSchema,
@@ -446,7 +447,10 @@ export class IngestionService {
return null;
}
const emlBuffer = email.eml ?? Buffer.from(email.body, 'utf-8');
const rawEmlBuffer = email.eml ?? Buffer.from(email.body, 'utf-8');
// Strip non-inline attachments from the .eml to avoid double-storing
// attachment data (attachments are stored separately).
const emlBuffer = await stripAttachmentsFromEml(rawEmlBuffer);
const emailHash = createHash('sha256').update(emlBuffer).digest('hex');
const sanitizedPath = email.path ? email.path : '';
const emailPath = `${config.storage.openArchiverFolderName}/${source.name.replaceAll(' ', '-')}-${source.id}/emails/${sanitizedPath}${email.id}.eml`;

24
pnpm-lock.yaml generated
View File

@@ -167,6 +167,9 @@ importers:
multer:
specifier: ^2.0.2
version: 2.0.2
nodemailer:
specifier: ^8.0.2
version: 8.0.2
pdf2json:
specifier: ^3.1.6
version: 3.1.6
@@ -225,6 +228,9 @@ importers:
'@types/node':
specifier: ^24.0.12
version: 24.0.13
'@types/nodemailer':
specifier: ^7.0.11
version: 7.0.11
'@types/yauzl':
specifier: ^2.10.3
version: 2.10.3
@@ -1883,6 +1889,9 @@ packages:
'@types/node@24.0.13':
resolution: {integrity: sha512-Qm9OYVOFHFYg3wJoTSrz80hoec5Lia/dPp84do3X7dZvLikQvM1YpmvTBEdIr/e+U8HTkFjLHLnl78K/qjf+jQ==}
'@types/nodemailer@7.0.11':
resolution: {integrity: sha512-E+U4RzR2dKrx+u3N4DlsmLaDC6mMZOM/TPROxA0UAPiTgI0y4CEFBmZE+coGWTjakDriRsXG368lNk1u9Q0a2g==}
'@types/qs@6.14.0':
resolution: {integrity: sha512-eOunJqu0K1923aExK6y8p6fsihYEn/BYuQ4g0CxAAgFc4b/ZLN4CrsRZ55srTdqoiLzU2B2evC+apEIxprEzkQ==}
@@ -2974,11 +2983,12 @@ packages:
glob@10.4.5:
resolution: {integrity: sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==}
deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me
hasBin: true
glob@7.2.3:
resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==}
deprecated: Glob versions prior to v9 are no longer supported
deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me
google-auth-library@10.1.0:
resolution: {integrity: sha512-GspVjZj1RbyRWpQ9FbAXMKjFGzZwDKnUHi66JJ+tcjcu5/xYAP1pdlWotCuIkMwjfVsxxDvsGZXGLzRt72D0sQ==}
@@ -3413,6 +3423,7 @@ packages:
mailsplit@5.4.5:
resolution: {integrity: sha512-oMfhmvclR689IIaQmIcR5nODnZRRVwAKtqFT407TIvmhX2OLUBnshUTcxzQBt3+96sZVDud9NfSe1NxAkUNXEQ==}
deprecated: This package has been renamed to @zone-eu/mailsplit. Please update your dependencies.
make-error@1.3.6:
resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==}
@@ -3663,6 +3674,10 @@ packages:
resolution: {integrity: sha512-nsrh2lO3j4GkLLXoeEksAMgAOqxOv6QumNRVQTJwKH4nuiww6iC2y7GyANs9kRAxCexg3+lTWM3PZ91iLlVjfg==}
engines: {node: '>=6.0.0'}
nodemailer@8.0.2:
resolution: {integrity: sha512-zbj002pZAIkWQFxyAaqoxvn+zoIwRnS40hgjqTXudKOOJkiFFgBeNqjgD3/YCR12sZnrghWYBY+yP1ZucdDRpw==}
engines: {node: '>=6.0.0'}
nopt@5.0.0:
resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==}
engines: {node: '>=6'}
@@ -3844,6 +3859,7 @@ packages:
prebuild-install@7.1.3:
resolution: {integrity: sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==}
engines: {node: '>=10'}
deprecated: No longer maintained. Please contact the author of the relevant native addon; alternatives are available.
hasBin: true
prettier-plugin-svelte@3.4.0:
@@ -6476,6 +6492,10 @@ snapshots:
dependencies:
undici-types: 7.8.0
'@types/nodemailer@7.0.11':
dependencies:
'@types/node': 24.0.13
'@types/qs@6.14.0': {}
'@types/range-parser@1.2.7': {}
@@ -8390,6 +8410,8 @@ snapshots:
nodemailer@7.0.5: {}
nodemailer@8.0.2: {}
nopt@5.0.0:
dependencies:
abbrev: 1.1.1