From 86728d7f0e48e5cf8fbd1977240909302b2d8ad9 Mon Sep 17 00:00:00 2001 From: Mohamed Bassem Date: Mon, 21 Apr 2025 03:15:14 +0000 Subject: fix(workers): Fix dompurify to run on readability's output, not its input --- apps/workers/crawlerWorker.ts | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'apps/workers') diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index 0bfb109c..9401088e 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -391,13 +391,21 @@ function extractReadableContent( logger.info( `[Crawler][${jobId}] Will attempt to extract readable content ...`, ); + const dom = new JSDOM(htmlContent, { url }); + const readableContent = new Readability(dom.window.document).parse(); + if (!readableContent || typeof readableContent.content !== "string") { + return null; + } + const window = new JSDOM("").window; const purify = DOMPurify(window); - const purifiedHTML = purify.sanitize(htmlContent); - const purifiedDOM = new JSDOM(purifiedHTML, { url }); - const readableContent = new Readability(purifiedDOM.window.document).parse(); + const purifiedHTML = purify.sanitize(readableContent.content); + logger.info(`[Crawler][${jobId}] Done extracting readable content.`); - return readableContent; + return { + content: purifiedHTML, + textContent: readableContent.textContent, + }; } async function storeScreenshot( -- cgit v1.2.3-70-g09d2