Diffstat (limited to 'apps/workers')
-rw-r--r--  apps/workers/crawlerWorker.ts | 50 ++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 40 insertions(+), 10 deletions(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index eb4a0697..282f5f43 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,10 +1,11 @@
 import assert from "assert";
+import * as dns from "dns";
 import { Readability } from "@mozilla/readability";
 import { Mutex } from "async-mutex";
 import { Job, Worker } from "bullmq";
 import DOMPurify from "dompurify";
 import { eq } from "drizzle-orm";
-import { isShuttingDown, shutdownPromise } from "exit";
+import { isShuttingDown } from "exit";
 import { JSDOM } from "jsdom";
 import metascraper from "metascraper";
 import metascraperDescription from "metascraper-description";
@@ -50,11 +51,38 @@ const browserMutex = new Mutex();
 async function launchBrowser() {
   browser = undefined;
   await browserMutex.runExclusive(async () => {
-    browser = await puppeteer.launch({
-      headless: serverConfig.crawler.headlessBrowser,
-      executablePath: serverConfig.crawler.browserExecutablePath,
-      userDataDir: serverConfig.crawler.browserUserDataDir,
-    });
+    try {
+      if (serverConfig.crawler.browserWebUrl) {
+        logger.info(
+          `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+        );
+        const webUrl = new URL(serverConfig.crawler.browserWebUrl);
+        // Resolve the hostname to an IP address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
+        const { address } = await dns.promises.lookup(webUrl.hostname);
+        webUrl.hostname = address;
+        logger.info(
+          `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+        );
+        browser = await puppeteer.connect({
+          browserURL: webUrl.toString(),
+        });
+      } else {
+        logger.info(`Launching a new browser instance`);
+        browser = await puppeteer.launch({
+          headless: serverConfig.crawler.headlessBrowser,
+          executablePath: serverConfig.crawler.browserExecutablePath,
+          userDataDir: serverConfig.crawler.browserUserDataDir,
+        });
+      }
+    } catch (e) {
+      logger.error(
+        `Failed to connect to the browser instance, will retry in 5 seconds: ${e}`,
+      );
+      setTimeout(() => {
+        launchBrowser();
+      }, 5000);
+      return;
+    }
     browser.on("disconnected", async () => {
       if (isShuttingDown) {
         logger.info(
@@ -91,13 +119,15 @@ export class CrawlerWorker {
     );

     worker.on("completed", (job) => {
-      const jobId = job?.id || "unknown";
+      const jobId = job?.id ?? "unknown";
       logger.info(`[Crawler][${jobId}] Completed successfully`);
     });

     worker.on("failed", (job, error) => {
-      const jobId = job?.id || "unknown";
-      logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
+      const jobId = job?.id ?? "unknown";
+      logger.error(
+        `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
+      );
     });

     return worker;
@@ -160,7 +190,7 @@ async function crawlPage(url: string) {
 }

 async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
-  const jobId = job.id || "unknown";
+  const jobId = job.id ?? "unknown";

   const request = zCrawlLinkRequestSchema.safeParse(job.data);
   if (!request.success) {
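
For reference, the resolve-then-connect workaround added above reads as follows when lifted into a standalone helper. This is a minimal sketch, not code from this repository: connectToExistingBrowser is a hypothetical name, and browserWebUrl stands in for serverConfig.crawler.browserWebUrl.

import * as dns from "dns";
import puppeteer, { Browser } from "puppeteer";

// Hypothetical helper mirroring the connect path introduced in the diff.
async function connectToExistingBrowser(browserWebUrl: string): Promise<Browser> {
  const webUrl = new URL(browserWebUrl);
  // puppeteer.connect() can fail on certain hostnames
  // (https://github.com/puppeteer/puppeteer/issues/2242), so substitute
  // the resolved IP address before connecting.
  const { address } = await dns.promises.lookup(webUrl.hostname);
  webUrl.hostname = address;
  // With browserURL, puppeteer discovers the browser's WebSocket endpoint
  // through its HTTP interface rather than needing a ws:// URL up front.
  return puppeteer.connect({ browserURL: webUrl.toString() });
}

The || to ?? changes in the later hunks also carry a semantic difference: ?? only falls back on null or undefined, so a falsy-but-valid job id such as 0 or an empty string would no longer be replaced by "unknown".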