From d33be149e661945fe67a9b6c4ff0d1e47917b8cd Mon Sep 17 00:00:00 2001 From: kamtschatka Date: Sun, 12 May 2024 14:06:41 +0200 Subject: feature: Take full page screenshots #143 (#148) Added the fullPage flag to take full-page screenshots. Updated the UI accordingly to properly show the screenshots instead of scaling them down. Co-authored-by: kamtschatka --- apps/web/components/dashboard/preview/LinkContentSection.tsx | 6 ++++-- apps/workers/crawlerWorker.ts | 3 ++- docs/docs/03-configuration.md | 1 + packages/shared/config.ts | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/apps/web/components/dashboard/preview/LinkContentSection.tsx b/apps/web/components/dashboard/preview/LinkContentSection.tsx index 6c51864f..29001c7f 100644 --- a/apps/web/components/dashboard/preview/LinkContentSection.tsx +++ b/apps/web/components/dashboard/preview/LinkContentSection.tsx @@ -16,10 +16,12 @@ function ScreenshotSection({ link }: { link: ZBookmarkedLink }) { return (
screenshot
); diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index 6b4d39f0..e7ed854b 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -231,10 +231,11 @@ async function crawlPage(jobId: string, url: string) { // If you change this, you need to change the asset type in the store function. type: "png", encoding: "binary", + fullPage: serverConfig.crawler.fullPageScreenshot, }), ]); logger.info( - `[Crawler][${jobId}] Finished capturing page content and a screenshot.`, + `[Crawler][${jobId}] Finished capturing page content and a screenshot. FullPageScreenshot: ${serverConfig.crawler.fullPageScreenshot}`, ); return { htmlContent, screenshot, url: page.url() }; } finally { diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md index 3d44f359..47bd115a 100644 --- a/docs/docs/03-configuration.md +++ b/docs/docs/03-configuration.md @@ -42,5 +42,6 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin | CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. | | CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. | | CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. | +| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. 
If disabled, the screenshot will only include the visible part of the page | | CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit | | CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection | diff --git a/packages/shared/config.ts b/packages/shared/config.ts index bdd58936..aae14a07 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -25,6 +25,7 @@ const allEnv = z.object({ CRAWLER_NUM_WORKERS: z.coerce.number().default(1), CRAWLER_DOWNLOAD_BANNER_IMAGE: stringBool("true"), CRAWLER_STORE_SCREENSHOT: stringBool("true"), + CRAWLER_FULL_PAGE_SCREENSHOT: stringBool("false"), MEILI_ADDR: z.string().optional(), MEILI_MASTER_KEY: z.string().default(""), LOG_LEVEL: z.string().default("debug"), @@ -66,6 +67,7 @@ const serverConfigSchema = allEnv.transform((val) => { navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC, downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE, storeScreenshot: val.CRAWLER_STORE_SCREENSHOT, + fullPageScreenshot: val.CRAWLER_FULL_PAGE_SCREENSHOT, }, meilisearch: val.MEILI_ADDR ? { -- cgit v1.2.3-70-g09d2