/*
 * Provenance: reconstructed from git patch ff676ecb86ec1b4bd1c9c94399bc8320dc8d3a06
 *   Author:  Til Wegener <38760774+tilwegener@users.noreply.github.com>
 *   Date:    Wed, 13 Aug 2025 20:11:47 +0200
 *   Subject: avoid hanging when pdf2json fails by resolving text extraction
 *            with an empty string
 *   Target:  packages/backend/src/helpers/textExtractor.ts
 *
 * The function below is the post-patch state of `extractTextFromPdf`, with the
 * watchdog timer leak fixed and parser failures logged instead of dropped.
 * `PDFParser` is expected to be imported by the enclosing file.
 */

/** Default time, in milliseconds, to wait for the parser before giving up. */
const PDF_PARSE_TIMEOUT_MS = 10000;

/**
 * Extracts the raw text content of a PDF held in memory.
 *
 * The returned promise never rejects: on a parser error event, a synchronous
 * throw from the parser, or a timeout, it resolves with an empty string so the
 * caller cannot hang or crash on a malformed document.
 *
 * @param buffer - The PDF file contents.
 * @param timeoutMs - Maximum time to wait for the parser to emit a result
 *   before resolving with an empty string. Defaults to 10 seconds.
 * @returns The extracted text, or `''` when extraction failed or timed out.
 */
function extractTextFromPdf(
  buffer: Buffer,
  timeoutMs: number = PDF_PARSE_TIMEOUT_MS
): Promise<string> {
  return new Promise<string>((resolve) => {
    // NOTE(review): the second argument presumably enables the raw-text mode
    // that getRawTextContent() relies on — confirm against the pdf2json docs.
    const pdfParser = new PDFParser(null, true);
    let completed = false;
    let timer: ReturnType<typeof setTimeout> | undefined;

    // Single exit point: settles the promise at most once and releases every
    // resource (watchdog timer, event listeners) held on behalf of this call.
    const finish = (text: string): void => {
      if (completed) return;
      completed = true;
      if (timer !== undefined) clearTimeout(timer);
      pdfParser.removeAllListeners();
      resolve(text);
    };

    pdfParser.on('pdfParser_dataError', (errData: unknown) => {
      console.error('Error parsing PDF', errData);
      finish('');
    });

    pdfParser.on('pdfParser_dataReady', () => {
      // A throw inside an event handler would propagate into the emitter and
      // leave the promise pending until the watchdog fires; settle it here.
      try {
        finish(pdfParser.getRawTextContent());
      } catch (err) {
        console.error('Error reading extracted PDF text', err);
        finish('');
      }
    });

    // Arm the watchdog BEFORE starting the parse so that a parser which
    // completes synchronously still cancels it through finish().
    timer = setTimeout(() => {
      console.error(`PDF parsing timed out after ${timeoutMs}ms`);
      finish('');
    }, timeoutMs);

    try {
      pdfParser.parseBuffer(buffer);
    } catch (err) {
      console.error('Error parsing PDF buffer', err);
      finish('');
    }
  });
}