about | summary | refs | log | tree | commit | diff | stats
path: root/apps
diff options
context:
space:
mode:
author: Mohamed Bassem <me@mbassem.com> 2025-04-21 03:15:14 +0000
committer: Mohamed Bassem <me@mbassem.com> 2025-04-21 03:15:14 +0000
commit: 86728d7f0e48e5cf8fbd1977240909302b2d8ad9 (patch)
tree: d90ad7001f4928b57313e04030c7f3b7d1f562a8 /apps
parent: f257a5ba95c62d10faee3d33bd97307a3f2f60c6 (diff)
download: karakeep-86728d7f0e48e5cf8fbd1977240909302b2d8ad9.tar.zst
fix(workers): Fix dompurify to run on readability's input not output
Diffstat (limited to 'apps')
-rw-r--r-- apps/workers/crawlerWorker.ts 16
1 files changed, 12 insertions, 4 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 0bfb109c..9401088e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -391,13 +391,21 @@ function extractReadableContent(
logger.info(
`[Crawler][${jobId}] Will attempt to extract readable content ...`,
);
+ const dom = new JSDOM(htmlContent, { url });
+ const readableContent = new Readability(dom.window.document).parse();
+ if (!readableContent || typeof readableContent.content !== "string") {
+ return null;
+ }
+
const window = new JSDOM("").window;
const purify = DOMPurify(window);
- const purifiedHTML = purify.sanitize(htmlContent);
- const purifiedDOM = new JSDOM(purifiedHTML, { url });
- const readableContent = new Readability(purifiedDOM.window.document).parse();
+ const purifiedHTML = purify.sanitize(readableContent.content);
+
logger.info(`[Crawler][${jobId}] Done extracting readable content.`);
- return readableContent;
+ return {
+ content: purifiedHTML,
+ textContent: readableContent.textContent,
+ };
}
async function storeScreenshot(