about summary refs log tree commit diff stats
path: root/apps
diff options
context:
space:
mode:
Diffstat (limited to 'apps')
-rw-r--r-- apps/workers/crawlerWorker.ts 16
1 files changed, 12 insertions, 4 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 0bfb109c..9401088e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -391,13 +391,21 @@ function extractReadableContent(
logger.info(
`[Crawler][${jobId}] Will attempt to extract readable content ...`,
);
+ const dom = new JSDOM(htmlContent, { url });
+ const readableContent = new Readability(dom.window.document).parse();
+ if (!readableContent || typeof readableContent.content !== "string") {
+ return null;
+ }
+
const window = new JSDOM("").window;
const purify = DOMPurify(window);
- const purifiedHTML = purify.sanitize(htmlContent);
- const purifiedDOM = new JSDOM(purifiedHTML, { url });
- const readableContent = new Readability(purifiedDOM.window.document).parse();
+ const purifiedHTML = purify.sanitize(readableContent.content);
+
logger.info(`[Crawler][${jobId}] Done extracting readable content.`);
- return readableContent;
+ return {
+ content: purifiedHTML,
+ textContent: readableContent.textContent,
+ };
}
async function storeScreenshot(