-rw-r--r--  .github/workflows/docker.yml   |  2
-rw-r--r--  apps/workers/crawlerWorker.ts  | 50
-rw-r--r--  docker/Dockerfile              |  9
-rw-r--r--  docker/Dockerfile.dev          |  7
-rw-r--r--  docker/docker-compose.dev.yml  |  9
-rw-r--r--  docker/docker-compose.yml      |  9
-rw-r--r--  docker/start-chrome.sh         |  7
-rw-r--r--  packages/shared/config.ts      |  1
8 files changed, 61 insertions(+), 33 deletions(-)
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 134842e5..72a82d33 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -49,6 +49,6 @@ jobs:
target: ${{ matrix.package }}
platforms: linux/amd64,linux/arm64
push: true
- tags: ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:${{github.event.release.name}},ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:release
+ tags: ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:${{ github.event.release.name }},ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:release
cache-from: type=registry,ref=ghcr.io/mohamedbassem/hoarder-build-cache:${{ matrix.package }}
cache-to: type=registry,mode=max,ref=ghcr.io/mohamedbassem/hoarder-build-cache:${{ matrix.package }}
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index eb4a0697..282f5f43 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,10 +1,11 @@
import assert from "assert";
+import * as dns from "dns";
import { Readability } from "@mozilla/readability";
import { Mutex } from "async-mutex";
import { Job, Worker } from "bullmq";
import DOMPurify from "dompurify";
import { eq } from "drizzle-orm";
-import { isShuttingDown, shutdownPromise } from "exit";
+import { isShuttingDown } from "exit";
import { JSDOM } from "jsdom";
import metascraper from "metascraper";
import metascraperDescription from "metascraper-description";
@@ -50,11 +51,38 @@ const browserMutex = new Mutex();
async function launchBrowser() {
browser = undefined;
await browserMutex.runExclusive(async () => {
- browser = await puppeteer.launch({
- headless: serverConfig.crawler.headlessBrowser,
- executablePath: serverConfig.crawler.browserExecutablePath,
- userDataDir: serverConfig.crawler.browserUserDataDir,
- });
+ try {
+ if (serverConfig.crawler.browserWebUrl) {
+ logger.info(
+ `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+ );
+ const webUrl = new URL(serverConfig.crawler.browserWebUrl);
+ // We need to resolve the IP address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
+ const { address } = await dns.promises.lookup(webUrl.hostname);
+ webUrl.hostname = address;
+ logger.info(
+ `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+ );
+ browser = await puppeteer.connect({
+ browserURL: webUrl.toString(),
+ });
+ } else {
+ logger.info(`Launching a new browser instance`);
+ browser = await puppeteer.launch({
+ headless: serverConfig.crawler.headlessBrowser,
+ executablePath: serverConfig.crawler.browserExecutablePath,
+ userDataDir: serverConfig.crawler.browserUserDataDir,
+ });
+ }
+ } catch (e) {
+ logger.error(
+ "Failed to connect to the browser instance, will retry in 5 secs",
+ );
+ setTimeout(() => {
+ launchBrowser();
+ }, 5000);
+ return;
+ }
browser.on("disconnected", async () => {
if (isShuttingDown) {
logger.info(
@@ -91,13 +119,15 @@ export class CrawlerWorker {
);
worker.on("completed", (job) => {
- const jobId = job?.id || "unknown";
+ const jobId = job?.id ?? "unknown";
logger.info(`[Crawler][${jobId}] Completed successfully`);
});
worker.on("failed", (job, error) => {
- const jobId = job?.id || "unknown";
- logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
+ const jobId = job?.id ?? "unknown";
+ logger.error(
+ `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
+ );
});
return worker;
@@ -160,7 +190,7 @@ async function crawlPage(url: string) {
}
async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
- const jobId = job.id || "unknown";
+ const jobId = job.id ?? "unknown";
const request = zCrawlLinkRequestSchema.safeParse(job.data);
if (!request.success) {
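
Note on the crawlerWorker.ts change above: the connect path boils down to the standalone sketch below. It assumes a Chrome instance already listening on its remote debugging port (such as the chrome service added to the compose files further down). Per the comment in the hunk, the hostname is resolved to an IP first as a workaround for https://github.com/puppeteer/puppeteer/issues/2242.

import * as dns from "dns";
import puppeteer, { type Browser } from "puppeteer";

// Sketch of the connect path: resolve the service hostname (e.g. "chrome")
// to an IP before handing the URL to puppeteer.connect, working around
// https://github.com/puppeteer/puppeteer/issues/2242.
async function connectToBrowser(browserWebUrl: string): Promise<Browser> {
  const webUrl = new URL(browserWebUrl); // e.g. "http://chrome:9222"
  const { address } = await dns.promises.lookup(webUrl.hostname);
  webUrl.hostname = address; // now e.g. "http://172.18.0.4:9222"
  return puppeteer.connect({ browserURL: webUrl.toString() });
}

Since the catch block retries launchBrowser every 5 seconds, a restarted chrome container that comes back with a new IP is re-resolved on the next attempt.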
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2164dc77..05432cbe 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -72,19 +72,10 @@ RUN --mount=type=cache,id=pnpm_workers,target=/pnpm/store pnpm deploy --node-lin
FROM --platform=$BUILDPLATFORM node:21-alpine AS workers
WORKDIR /app
-# Install chromium needed for puppeteer
-RUN apk add --no-cache chromium runuser
-ENV CHROME_PATH "/usr/bin/chromium-browser"
-ENV BROWSER_EXECUTABLE_PATH "/app/start-chrome.sh"
-ENV BROWSER_USER_DATA_DIR="/tmp/chrome"
-
COPY --from=workers_builder /prod apps/workers
RUN corepack enable
-ADD docker/start-chrome.sh .
-RUN chmod +x start-chrome.sh
-
WORKDIR /app/apps/workers
USER root
diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev
index 9a8de32b..cd15b20d 100644
--- a/docker/Dockerfile.dev
+++ b/docker/Dockerfile.dev
@@ -1,11 +1,6 @@
FROM node:21-alpine
-RUN apk add --no-cache libc6-compat chromium runuser make g++ py3-pip linux-headers
+RUN apk add --no-cache libc6-compat make g++ py3-pip linux-headers
ENV PUPPETEER_SKIP_DOWNLOAD true
-ENV CHROME_PATH "/usr/bin/chromium-browser"
-ENV BROWSER_EXECUTABLE_PATH "/bin/start-chrome.sh"
-ENV BROWSER_USER_DATA_DIR="/tmp/chrome"
WORKDIR /app
-ADD start-chrome.sh /bin
-RUN chmod +x /bin/start-chrome.sh
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index d7cbbbf0..80547930 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -23,6 +23,14 @@ services:
image: redis:7.2-alpine
volumes:
- redis:/data
+ chrome:
+ image: gcr.io/zenika-hub/alpine-chrome:100
+ restart: unless-stopped
+ command:
+ - --no-sandbox
+ - --disable-gpu
+ - --remote-debugging-address=0.0.0.0
+ - --remote-debugging-port=9222
meilisearch:
image: getmeili/meilisearch:v1.6
volumes:
@@ -37,6 +45,7 @@ services:
environment:
REDIS_HOST: redis
MEILI_ADDR: http://meilisearch:7700
+ BROWSER_WEB_URL: http://chrome:9222
DATA_DIR: /data
# OPENAI_API_KEY: ...
command:
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 03cb5a82..51c564b8 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -18,6 +18,14 @@ services:
restart: unless-stopped
volumes:
- redis:/data
+ chrome:
+ image: gcr.io/zenika-hub/alpine-chrome:100
+ restart: unless-stopped
+ command:
+ - --no-sandbox
+ - --disable-gpu
+ - --remote-debugging-address=0.0.0.0
+ - --remote-debugging-port=9222
meilisearch:
image: getmeili/meilisearch:v1.6
restart: unless-stopped
@@ -35,6 +43,7 @@ services:
environment:
REDIS_HOST: redis
MEILI_ADDR: http://meilisearch:7700
+ BROWSER_WEB_URL: http://chrome:9222
DATA_DIR: /data
# OPENAI_API_KEY: ...
depends_on:
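
Both compose files point the workers at the new chrome service via BROWSER_WEB_URL, and --remote-debugging-address=0.0.0.0 is what makes the DevTools endpoint reachable from other containers. A quick reachability check: Chrome serves version metadata over plain HTTP at /json/version. A minimal probe, assuming Node 18+ for the global fetch (the probe helper is just for illustration):

// Minimal reachability check for the remote debugging endpoint configured
// via BROWSER_WEB_URL (http://chrome:9222 in the compose files above).
async function probe(): Promise<void> {
  const base = process.env.BROWSER_WEB_URL ?? "http://chrome:9222";
  const res = await fetch(`${base}/json/version`);
  const info = (await res.json()) as {
    Browser: string;
    webSocketDebuggerUrl: string;
  };
  console.log(`${info.Browser} at ${info.webSocketDebuggerUrl}`);
}

probe().catch((err) => {
  console.error(`Chrome endpoint unreachable: ${err}`);
  process.exit(1);
});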
diff --git a/docker/start-chrome.sh b/docker/start-chrome.sh
deleted file mode 100644
index 9f715906..00000000
--- a/docker/start-chrome.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/sh
-
-set -x;
-id -u chrome &>/dev/null || adduser -S chrome;
-mkdir -p $BROWSER_USER_DATA_DIR;
-chown chrome $BROWSER_USER_DATA_DIR;
-runuser -u chrome -- $CHROME_PATH --no-sandbox $@;
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 25806ae0..e12c55c2 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -14,6 +14,7 @@ const serverConfig = {
headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true",
browserExecutablePath: process.env.BROWSER_EXECUTABLE_PATH, // If not set, the system's browser will be used
browserUserDataDir: process.env.BROWSER_USER_DATA_DIR,
+ browserWebUrl: process.env.BROWSER_WEB_URL,
},
meilisearch: process.env.MEILI_ADDR
? {
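
The new browserWebUrl option is read straight from the environment without validation, so a malformed BROWSER_WEB_URL only surfaces later inside launchBrowser's retry loop. A possible fail-fast check, purely illustrative (the parseBrowserWebUrl helper is hypothetical and not part of this patch):

// Hypothetical startup validation, not part of this patch: reject a
// malformed BROWSER_WEB_URL at boot instead of retrying in launchBrowser.
function parseBrowserWebUrl(raw: string | undefined): string | undefined {
  if (!raw) {
    return undefined; // unset means the worker launches a local browser
  }
  try {
    return new URL(raw).toString();
  } catch {
    throw new Error(`BROWSER_WEB_URL is not a valid URL: ${raw}`);
  }
}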