diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-04-21 03:15:14 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-04-21 03:15:14 +0000 |
| commit | 86728d7f0e48e5cf8fbd1977240909302b2d8ad9 (patch) | |
| tree | d90ad7001f4928b57313e04030c7f3b7d1f562a8 /apps/workers | |
| parent | f257a5ba95c62d10faee3d33bd97307a3f2f60c6 (diff) | |
| download | karakeep-86728d7f0e48e5cf8fbd1977240909302b2d8ad9.tar.zst | |
fix(workers): Fix dompurify to run on readability's input not output
Diffstat (limited to 'apps/workers')
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 16 |
1 files changed, 12 insertions, 4 deletions
```diff
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 0bfb109c..9401088e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -391,13 +391,21 @@ function extractReadableContent(
   logger.info(
     `[Crawler][${jobId}] Will attempt to extract readable content ...`,
   );
+  const dom = new JSDOM(htmlContent, { url });
+  const readableContent = new Readability(dom.window.document).parse();
+  if (!readableContent || typeof readableContent.content !== "string") {
+    return null;
+  }
+
   const window = new JSDOM("").window;
   const purify = DOMPurify(window);
-  const purifiedHTML = purify.sanitize(htmlContent);
-  const purifiedDOM = new JSDOM(purifiedHTML, { url });
-  const readableContent = new Readability(purifiedDOM.window.document).parse();
+  const purifiedHTML = purify.sanitize(readableContent.content);
+
   logger.info(`[Crawler][${jobId}] Done extracting readable content.`);
-  return readableContent;
+  return {
+    content: purifiedHTML,
+    textContent: readableContent.textContent,
+  };
 }
 
 async function storeScreenshot(
```
