aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--apps/workers/crawlerWorker.ts28
-rw-r--r--docs/docs/03-configuration.md1
-rw-r--r--packages/shared/config.ts2
3 files changed, 21 insertions, 10 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 5798b98c..e75a8586 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -322,23 +322,31 @@ async function crawlPage(
let screenshot: Buffer | undefined = undefined;
if (serverConfig.crawler.storeScreenshot) {
- screenshot = await Promise.race<Buffer | undefined>([
- page
- .screenshot({
+ try {
+ screenshot = await Promise.race<Buffer>([
+ page.screenshot({
// If you change this, you need to change the asset type in the store function.
type: "png",
encoding: "binary",
fullPage: serverConfig.crawler.fullPageScreenshot,
- })
- .catch(() => undefined),
- new Promise((f) => setTimeout(f, 5000)),
- ]);
- if (!screenshot) {
- logger.warn(`[Crawler][${jobId}] Failed to capture the screenshot.`);
- } else {
+ }),
+ new Promise((_, reject) =>
+ setTimeout(
+ () =>
+ reject(
+ "TIMED_OUT, consider increasing CRAWLER_SCREENSHOT_TIMEOUT_SEC",
+ ),
+ serverConfig.crawler.screenshotTimeoutSec * 1000,
+ ),
+ ),
+ ]);
logger.info(
`[Crawler][${jobId}] Finished capturing page content and a screenshot. FullPageScreenshot: ${serverConfig.crawler.fullPageScreenshot}`,
);
+ } catch (e) {
+ logger.warn(
+ `[Crawler][${jobId}] Failed to capture the screenshot. Reason: ${e}`,
+ );
}
}
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 6e11774e..235462ef 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -81,6 +81,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
+| CRAWLER_SCREENSHOT_TIMEOUT_SEC | No | 5 | How long to wait for the screenshot to finish before timing out. If you are capturing full-page screenshots of long webpages, consider increasing this value. |
| CRAWLER_FULL_PAGE_ARCHIVE | No | false | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 12578b1f..55c13df5 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -54,6 +54,7 @@ const allEnv = z.object({
.string()
.default("")
.transform((t) => t.split("%%").filter((a) => a)),
+ CRAWLER_SCREENSHOT_TIMEOUT_SEC: z.coerce.number().default(5),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
@@ -123,6 +124,7 @@ const serverConfigSchema = allEnv.transform((val) => {
downloadVideoTimeout: val.CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC,
enableAdblocker: val.CRAWLER_ENABLE_ADBLOCKER,
ytDlpArguments: val.CRAWLER_YTDLP_ARGS,
+ screenshotTimeoutSec: val.CRAWLER_SCREENSHOT_TIMEOUT_SEC,
},
ocr: {
langs: val.OCR_LANGS,