aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers
diff options
context:
space:
mode:
Diffstat (limited to 'apps/workers')
-rw-r--r--apps/workers/package.json6
-rw-r--r--apps/workers/workers/crawlerWorker.ts67
2 files changed, 39 insertions, 34 deletions
diff --git a/apps/workers/package.json b/apps/workers/package.json
index c888350e..2ed6f9df 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -4,7 +4,7 @@
"version": "0.1.0",
"private": true,
"dependencies": {
- "@ghostery/adblocker-puppeteer": "^2.5.1",
+ "@ghostery/adblocker-playwright": "^2.5.1",
"@karakeep/db": "workspace:^0.1.0",
"@karakeep/shared": "workspace:^0.1.0",
"@karakeep/trpc": "workspace:^0.1.0",
@@ -36,8 +36,8 @@
"pdf2json": "^3.1.5",
"pdf2pic": "^3.1.3",
"pdfjs-dist": "^4.2.67",
- "puppeteer": "^22.0.0",
- "puppeteer-extra": "^3.3.6",
+ "playwright": "^1.42.1",
+ "playwright-extra": "^4.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"rss-parser": "^3.13.0",
"tesseract.js": "^5.1.1",
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index f0e831c2..d884d149 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -2,8 +2,7 @@ import * as dns from "dns";
import { promises as fs } from "fs";
import * as path from "node:path";
import * as os from "os";
-import type { Browser } from "puppeteer";
-import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
+import { PlaywrightBlocker } from "@ghostery/adblocker-playwright";
import { Readability } from "@mozilla/readability";
import { Mutex } from "async-mutex";
import DOMPurify from "dompurify";
@@ -25,7 +24,8 @@ import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
import fetch from "node-fetch";
-import puppeteer from "puppeteer-extra";
+import { Browser } from "playwright";
+import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";
@@ -81,38 +81,37 @@ const metascraperParser = metascraper([
]);
let globalBrowser: Browser | undefined;
-let globalBlocker: PuppeteerBlocker | undefined;
+let globalBlocker: PlaywrightBlocker | undefined;
// Guards the interactions with the browser instance.
// This is needed given that most of the browser APIs are async.
const browserMutex = new Mutex();
async function startBrowserInstance() {
- const defaultViewport = {
- width: 1440,
- height: 900,
- };
if (serverConfig.crawler.browserWebSocketUrl) {
logger.info(
`[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
);
- return puppeteer.connect({
- browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
- defaultViewport,
+ return await chromium.connect(serverConfig.crawler.browserWebSocketUrl, {
+ // Important: using slowMo to ensure stability with remote browser
+ slowMo: 100,
+ timeout: 5000,
});
} else if (serverConfig.crawler.browserWebUrl) {
logger.info(
`[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
);
+
const webUrl = new URL(serverConfig.crawler.browserWebUrl);
- // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
const { address } = await dns.promises.lookup(webUrl.hostname);
webUrl.hostname = address;
logger.info(
`[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
);
- return puppeteer.connect({
- browserURL: webUrl.toString(),
- defaultViewport,
+
+ return await chromium.connectOverCDP(webUrl.toString(), {
+ // Important: using slowMo to ensure stability with remote browser
+ slowMo: 100,
+ timeout: 5000,
});
} else {
logger.info(`Running in browserless mode`);
@@ -141,12 +140,12 @@ async function launchBrowser() {
globalBrowser?.on("disconnected", () => {
if (isShuttingDown) {
logger.info(
- "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
+ "[Crawler] The Playwright browser got disconnected. But we're shutting down so won't restart it.",
);
return;
}
logger.info(
- "[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
+ "[Crawler] The Playwright browser got disconnected. Will attempt to launch it again.",
);
launchBrowser();
});
@@ -155,11 +154,11 @@ async function launchBrowser() {
export class CrawlerWorker {
static async build() {
- puppeteer.use(StealthPlugin());
+ chromium.use(StealthPlugin());
if (serverConfig.crawler.enableAdblocker) {
try {
logger.info("[crawler] Loading adblocker ...");
- globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
+ globalBlocker = await PlaywrightBlocker.fromPrebuiltFull(fetch, {
path: path.join(os.tmpdir(), "karakeep_adblocker.bin"),
read: fs.readFile,
write: fs.writeFile,
@@ -287,39 +286,45 @@ async function crawlPage(
if (!browser) {
return browserlessCrawlPage(jobId, url, abortSignal);
}
- const context = await browser.createBrowserContext();
+ const context = await browser.newContext({
+ viewport: { width: 1440, height: 900 },
+ userAgent:
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+ });
try {
+ // Create a new page in the context
const page = await context.newPage();
+
+ // Apply ad blocking
if (globalBlocker) {
await globalBlocker.enableBlockingInPage(page);
}
- await page.setUserAgent(
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
- );
+ // Navigate to the target URL
+ logger.info(`[Crawler][${jobId}] Navigating to "${url}"`);
const response = await page.goto(url, {
timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
+ waitUntil: "domcontentloaded",
});
+
logger.info(
`[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
);
- // Wait until there's at most two connections for 2 seconds
- // Attempt to wait only for 5 seconds
+ // Wait until network is relatively idle or timeout after 5 seconds
await Promise.race([
- page.waitForNetworkIdle({
- idleTime: 1000, // 1 sec
- concurrency: 2,
- }),
- new Promise((f) => setTimeout(f, 5000)),
+ page.waitForLoadState("networkidle", { timeout: 5000 }).catch(() => ({})),
+ new Promise((resolve) => setTimeout(resolve, 5000)),
]);
logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
+ // Extract content from the page
const htmlContent = await page.content();
logger.info(`[Crawler][${jobId}] Successfully fetched the page content.`);
+ // Take a screenshot if configured
let screenshot: Buffer | undefined = undefined;
if (serverConfig.crawler.storeScreenshot) {
try {
@@ -327,7 +332,6 @@ async function crawlPage(
page.screenshot({
// If you change this, you need to change the asset type in the store function.
type: "png",
- encoding: "binary",
fullPage: serverConfig.crawler.fullPageScreenshot,
}),
new Promise((_, reject) =>
@@ -358,6 +362,7 @@ async function crawlPage(
};
} finally {
await context.close();
+ // Only close the browser if it was created on demand
if (serverConfig.crawler.browserConnectOnDemand) {
await browser.close();
}