Diffstat (limited to 'apps/workers/crawlerWorker.ts')
 apps/workers/crawlerWorker.ts | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 376e50ea..3952a287 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,4 +1,3 @@
-import assert from "assert";
import * as dns from "dns";
import { promises as fs } from "fs";
import * as path from "node:path";
@@ -104,11 +103,8 @@ async function startBrowserInstance() {
defaultViewport,
});
} else {
- logger.info(`Launching a new browser instance`);
- return puppeteer.launch({
- headless: serverConfig.crawler.headlessBrowser,
- defaultViewport,
- });
+ logger.info(`Running in browserless mode`);
+ return undefined;
}
}
@@ -130,7 +126,7 @@ async function launchBrowser() {
}, 5000);
return;
}
- globalBrowser.on("disconnected", () => {
+ globalBrowser?.on("disconnected", () => {
if (isShuttingDown) {
logger.info(
"[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
@@ -238,15 +234,43 @@ function validateUrl(url: string) {
}
}
-async function crawlPage(jobId: string, url: string) {
- let browser: Browser;
+async function browserlessCrawlPage(jobId: string, url: string) {
+ logger.info(
+ `[Crawler][${jobId}] Running in browserless mode. Will do a plain http request to "${url}". Screenshots will be disabled.`,
+ );
+ const response = await fetch(url, {
+ signal: AbortSignal.timeout(5000),
+ });
+ if (!response.ok) {
+ throw new Error(`Failed to crawl page: ${response.status}`);
+ }
+ logger.info(
+ `[Crawler][${jobId}] Successfully fetched the content of "${url}". Status: ${response.status}, Size: ${response.headers.get("content-length") ?? "unknown"}`,
+ );
+ return {
+ htmlContent: await response.text(),
+ screenshot: undefined,
+ url: response.url,
+ };
+}
+
+async function crawlPage(
+ jobId: string,
+ url: string,
+): Promise<{
+ htmlContent: string;
+ screenshot: Buffer | undefined;
+ url: string;
+}> {
+ let browser: Browser | undefined;
if (serverConfig.crawler.browserConnectOnDemand) {
browser = await startBrowserInstance();
} else {
- assert(globalBrowser);
browser = globalBrowser;
}
- assert(browser);
+ if (!browser) {
+ return browserlessCrawlPage(jobId, url);
+ }
const context = await browser.createBrowserContext();
try {
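
Note: the net effect of this change is that crawlPage now has an explicit result type and can resolve without a screenshot whenever no browser instance is available. Below is a minimal sketch, not part of this diff, of how a caller might consume that contract; crawlPage's signature is taken from the hunk above, while storeScreenshot and parseHtml are hypothetical helpers used only for illustration.

type CrawlResult = {
  htmlContent: string;
  screenshot: Buffer | undefined; // undefined in browserless mode
  url: string; // final URL after any redirects
};

declare function crawlPage(jobId: string, url: string): Promise<CrawlResult>;
// Hypothetical helpers, not part of the repository:
declare function storeScreenshot(jobId: string, screenshot: Buffer): Promise<void>;
declare function parseHtml(url: string, html: string): Promise<void>;

async function handleCrawlJob(jobId: string, url: string): Promise<void> {
  const result = await crawlPage(jobId, url);
  // Browserless crawls never produce a screenshot, so treat it as optional.
  if (result.screenshot !== undefined) {
    await storeScreenshot(jobId, result.screenshot);
  }
  await parseHtml(result.url, result.htmlContent);
}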