| author | Mohamed Bassem <me@mbassem.com> | 2024-11-30 18:05:04 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2024-11-30 18:05:04 +0000 |
| commit | 18101001d334c96772ee56ec82ede33790f34fa8 | |
| tree | 6b28e44768ebf50d53b2db2586e18c1ce357bf03 /apps | |
| parent | 5a496916c386bf1bd31ee841b76cb28d855716b9 | |
feature(workers): Allow running hoarder without chrome as a hard dependency. Fixes #650
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 46 |
1 file changed, 35 insertions, 11 deletions
```diff
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 376e50ea..3952a287 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,4 +1,3 @@
-import assert from "assert";
 import * as dns from "dns";
 import { promises as fs } from "fs";
 import * as path from "node:path";
@@ -104,11 +103,8 @@ async function startBrowserInstance() {
       defaultViewport,
     });
   } else {
-    logger.info(`Launching a new browser instance`);
-    return puppeteer.launch({
-      headless: serverConfig.crawler.headlessBrowser,
-      defaultViewport,
-    });
+    logger.info(`Running in browserless mode`);
+    return undefined;
   }
 }
 
@@ -130,7 +126,7 @@ async function launchBrowser() {
     }, 5000);
     return;
   }
-  globalBrowser.on("disconnected", () => {
+  globalBrowser?.on("disconnected", () => {
     if (isShuttingDown) {
       logger.info(
         "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
@@ -238,15 +234,43 @@ function validateUrl(url: string) {
   }
 }
 
-async function crawlPage(jobId: string, url: string) {
-  let browser: Browser;
+async function browserlessCrawlPage(jobId: string, url: string) {
+  logger.info(
+    `[Crawler][${jobId}] Running in browserless mode. Will do a plain http request to "${url}". Screenshots will be disabled.`,
+  );
+  const response = await fetch(url, {
+    signal: AbortSignal.timeout(5000),
+  });
+  if (!response.ok) {
+    throw new Error(`Failed to crawl page: ${response.status}`);
+  }
+  logger.info(
+    `[Crawler][${jobId}] Successfully fetched the content of "${url}". Status: ${response.status}, Size: ${response.size}`,
+  );
+  return {
+    htmlContent: await response.text(),
+    screenshot: undefined,
+    url: response.url,
+  };
+}
+
+async function crawlPage(
+  jobId: string,
+  url: string,
+): Promise<{
+  htmlContent: string;
+  screenshot: Buffer | undefined;
+  url: string;
+}> {
+  let browser: Browser | undefined;
   if (serverConfig.crawler.browserConnectOnDemand) {
     browser = await startBrowserInstance();
   } else {
-    assert(globalBrowser);
     browser = globalBrowser;
   }
-  assert(browser);
+  if (!browser) {
+    return browserlessCrawlPage(jobId, url);
+  }
   const context = await browser.createBrowserContext();
   try {
```
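Taken together, the hunks above make the Puppeteer browser optional: `startBrowserInstance()` now returns `undefined` instead of launching a local Chrome, and `crawlPage` degrades to a plain HTTP fetch, bounded to five seconds with `AbortSignal.timeout`, rather than failing an assertion. The following sketch assembles that control flow into one self-contained example; the `serverConfig` and `logger` stubs, the `Browser` stand-in, and the usage harness at the bottom are illustrative assumptions, not code from the commit.

```typescript
// Minimal sketch of the patch's fallback flow (Node 18+ for global fetch).
// The stubs below are placeholders, not part of the commit.
type CrawlResult = {
  htmlContent: string;
  screenshot: Buffer | undefined;
  url: string;
};

// Stand-in for puppeteer's Browser type.
type Browser = { createBrowserContext(): Promise<unknown> };

const serverConfig = { crawler: { browserConnectOnDemand: true } }; // stub
const logger = { info: (msg: string) => console.log(msg) }; // stub

let globalBrowser: Browser | undefined; // stays undefined in browserless mode

// Mirrors the patched else-branch: no local browser is launched anymore.
async function startBrowserInstance(): Promise<Browser | undefined> {
  logger.info("Running in browserless mode");
  return undefined;
}

// The new fallback: a plain HTTP request with a 5-second timeout.
async function browserlessCrawlPage(
  jobId: string,
  url: string,
): Promise<CrawlResult> {
  logger.info(`[Crawler][${jobId}] Browserless fetch of "${url}"`);
  const response = await fetch(url, { signal: AbortSignal.timeout(5000) });
  if (!response.ok) {
    throw new Error(`Failed to crawl page: ${response.status}`);
  }
  return {
    htmlContent: await response.text(),
    screenshot: undefined, // screenshots require a real browser
    url: response.url, // final URL after any redirects
  };
}

async function crawlPage(jobId: string, url: string): Promise<CrawlResult> {
  const browser = serverConfig.crawler.browserConnectOnDemand
    ? await startBrowserInstance()
    : globalBrowser;
  if (!browser) {
    // Chrome unavailable: degrade gracefully instead of asserting.
    return browserlessCrawlPage(jobId, url);
  }
  // ...the full puppeteer-based crawl would run here, as before...
  throw new Error("puppeteer path omitted from this sketch");
}

// Usage: with no browser configured, this resolves via the plain fetch path.
crawlPage("job-1", "https://example.com").then((result) =>
  logger.info(`Fetched ${result.htmlContent.length} chars from ${result.url}`),
);
```

Widening the return type of `startBrowserInstance()` to `Browser | undefined` and branching once at the top of `crawlPage` keeps the degradation in one place: pages still get their HTML content crawled, and only the screenshot step, which genuinely needs Chrome, is skipped (the patch returns `screenshot: undefined` and logs that screenshots are disabled).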
