-rw-r--r--  apps/workers/crawlerWorker.ts  32
-rw-r--r--  docs/docs/03-configuration.md   6
-rw-r--r--  packages/shared/config.ts       2
3 files changed, 29 insertions, 11 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index a969ab86..cce409e5 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -56,14 +56,14 @@ async function launchBrowser() {
try {
if (serverConfig.crawler.browserWebUrl) {
logger.info(
- `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+ `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
);
const webUrl = new URL(serverConfig.crawler.browserWebUrl);
// We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
const { address: address } = await dns.promises.lookup(webUrl.hostname);
webUrl.hostname = address;
logger.info(
- `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+ `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
);
browser = await puppeteer.connect({
browserURL: webUrl.toString(),
@@ -76,7 +76,7 @@ async function launchBrowser() {
}
} catch (e) {
logger.error(
- "Failed to connect to the browser instance, will retry in 5 secs",
+ "[Crawler] Failed to connect to the browser instance, will retry in 5 secs",
);
setTimeout(() => {
launchBrowser();
@@ -86,12 +86,12 @@ async function launchBrowser() {
browser.on("disconnected", () => {
if (isShuttingDown) {
logger.info(
- "The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
+ "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
);
return;
}
logger.info(
- "The puppeteer browser got disconnected. Will attempt to launch it again.",
+ "[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
);
launchBrowser();
});
@@ -111,7 +111,10 @@ export class CrawlerWorker {
logger.info("Starting crawler worker ...");
const worker = new Worker<ZCrawlLinkRequest, void>(
LinkCrawlerQueue.name,
- withTimeout(runCrawler, /* timeoutSec */ 30),
+ withTimeout(
+ runCrawler,
+ /* timeoutSec */ serverConfig.crawler.jobTimeoutSec,
+ ),
{
connection: queueConnectionDetails,
autorun: false,
@@ -125,9 +128,7 @@ export class CrawlerWorker {
worker.on("failed", (job, error) => {
const jobId = job?.id ?? "unknown";
- logger.error(
- `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
- );
+ logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
});
return worker;
@@ -161,7 +162,7 @@ function validateUrl(url: string) {
}
}
-async function crawlPage(url: string) {
+async function crawlPage(jobId: string, url: string) {
assert(browser);
const context = await browser.createBrowserContext();
@@ -171,6 +172,9 @@ async function crawlPage(url: string) {
await page.goto(url, {
timeout: 10000, // 10 seconds
});
+ logger.info(
+ `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
+ );
// Wait until there's at most two connections for 2 seconds
// Attempt to wait only for 5 seconds
@@ -182,6 +186,8 @@ async function crawlPage(url: string) {
new Promise((f) => setTimeout(f, 5000)),
]);
+ logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
+
const htmlContent = await page.content();
return htmlContent;
} finally {
@@ -208,12 +214,16 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
);
validateUrl(url);
- const htmlContent = await crawlPage(url);
+ const htmlContent = await crawlPage(jobId, url);
+ logger.info(
+ `[Crawler][${jobId}] Will attempt to parse the content of the page ...`,
+ );
const meta = await metascraperParser({
url,
html: htmlContent,
});
+ logger.info(`[Crawler][${jobId}] Done parsing the content of the page.`);
const window = new JSDOM("").window;
const purify = DOMPurify(window);
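
The hunk above swaps the hard-coded 30-second timeout for the configurable serverConfig.crawler.jobTimeoutSec. As a rough, illustrative sketch of what a per-job timeout wrapper in the spirit of withTimeout can look like (assumed names and shape, not the project's actual helper), the handler is raced against a timer:

    import { Job } from "bullmq";

    // Hypothetical sketch only: wraps a BullMQ job handler and fails it
    // if it does not settle within timeoutSec seconds.
    type Handler<T> = (job: Job<T, void>) => Promise<void>;

    function withJobTimeout<T>(handler: Handler<T>, timeoutSec: number): Handler<T> {
      return async (job) => {
        let timer: NodeJS.Timeout | undefined;
        const timeout = new Promise<never>((_, reject) => {
          timer = setTimeout(
            () => reject(new Error(`Job ${job.id} timed out after ${timeoutSec}s`)),
            timeoutSec * 1000,
          );
        });
        try {
          // Whichever settles first wins: the crawl result or the timeout rejection.
          await Promise.race([handler(job), timeout]);
        } finally {
          if (timer) clearTimeout(timer);
        }
      };
    }

In that model, raising CRAWLER_JOB_TIMEOUT_SEC simply gives the crawl handler more time before the race is lost and the job is marked failed.
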
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 8bf8a069..1307bcfd 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -34,3 +34,9 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
| INFERENCE_TEXT_MODEL | No | gpt-3.5-turbo-0125 | The model to use for text inference. You'll need to change this to some other model if you're using ollama. |
| INFERENCE_IMAGE_MODEL | No | gpt-4-vision-preview | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
| INFERENCE_LANG | No | english | The language in which the tags will be generated. |
+
+## Crawler Configs
+
+| Name | Required | Default | Description |
+| ----------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long (in seconds) to wait for a crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to increase this value. |
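
For instance, in a docker compose deployment the new variable would be passed to the container that runs the crawler workers; the service name below is only an assumed example, use whatever your compose file calls that service:

      workers:
        environment:
          # Example: allow slow pages up to two minutes before the crawl job times out
          - CRAWLER_JOB_TIMEOUT_SEC=120
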
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 11140c3b..75274a4e 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -20,6 +20,7 @@ const allEnv = z.object({
REDIS_DB_IDX: z.coerce.number().optional(),
CRAWLER_HEADLESS_BROWSER: stringBool("true"),
BROWSER_WEB_URL: z.string().url().optional(),
+ CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
@@ -56,6 +57,7 @@ const serverConfigSchema = allEnv.transform((val) => {
crawler: {
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
+ jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
},
meilisearch: val.MEILI_ADDR
? {