From be1b7f7e1c0cb3d905e13aa1a95e295b816cbdeb Mon Sep 17 00:00:00 2001
From: kamtschatka
Date: Sat, 22 Jun 2024 18:52:40 +0200
Subject: feature: add support for PDF links. Fixes #28 (#216)

* feature request: pdf support #28

Added a new sourceUrl column to the asset bookmarks
Added transforming a link bookmark pointing at a pdf to an asset bookmark
Made sure the "View Original" link is also shown for asset bookmarks that have a sourceUrl
Updated gitignore for IDEA

* remove pdf parsing from the crawler

* extract the HTTP logic into its own function to avoid duplicating the post-processing actions (openai/index)

* Add 5s timeout to the content type fetch

---------

Co-authored-by: MohamedBassem
---
 apps/workers/crawlerWorker.ts | 220 +++++++++++++++++++++++++++++++-----------
 1 file changed, 163 insertions(+), 57 deletions(-)

(limited to 'apps/workers/crawlerWorker.ts')

diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 58f1aa85..eedb7b1e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,5 +1,6 @@
 import assert from "assert";
 import * as dns from "dns";
+import * as path from "node:path";
 import type { Job } from "bullmq";
 import type { Browser } from "puppeteer";
 import { Readability } from "@mozilla/readability";
@@ -26,8 +27,9 @@ import { withTimeout } from "utils";
 
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
 import { db } from "@hoarder/db";
-import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
+import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
 import {
+  ASSET_TYPES,
   deleteAsset,
   newAssetId,
   saveAsset,
@@ -68,7 +70,7 @@ async function startBrowserInstance() {
     logger.info(
       `[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
     );
-    return await puppeteer.connect({
+    return puppeteer.connect({
       browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
       defaultViewport,
     });
@@ -83,13 +85,13 @@ async function startBrowserInstance() {
     logger.info(
       `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
     );
-    return await puppeteer.connect({
+    return puppeteer.connect({
       browserURL: webUrl.toString(),
       defaultViewport,
     });
   } else {
     logger.info(`Launching a new browser instance`);
-    return await puppeteer.launch({
+    return puppeteer.launch({
       headless: serverConfig.crawler.headlessBrowser,
       defaultViewport,
     });
@@ -271,7 +273,11 @@ async function crawlPage(jobId: string, url: string) {
     logger.info(
       `[Crawler][${jobId}] Finished capturing page content and a screenshot. FullPageScreenshot: ${serverConfig.crawler.fullPageScreenshot}`,
     );
-    return { htmlContent, screenshot, url: page.url() };
+    return {
+      htmlContent,
+      screenshot,
+      url: page.url(),
+    };
   } finally {
     await context.close();
   }
@@ -337,22 +343,17 @@ async function storeScreenshot(
   return assetId;
 }
 
-async function downloadAndStoreImage(
+async function downloadAndStoreFile(
   url: string,
   userId: string,
   jobId: string,
+  fileType: string,
 ) {
-  if (!serverConfig.crawler.downloadBannerImage) {
-    logger.info(
-      `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
-    );
-    return null;
-  }
   try {
-    logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
+    logger.info(`[Crawler][${jobId}] Downloading ${fileType} from "${url}"`);
     const response = await fetch(url);
     if (!response.ok) {
-      throw new Error(`Failed to download image: ${response.status}`);
+      throw new Error(`Failed to download ${fileType}: ${response.status}`);
     }
     const buffer = await response.arrayBuffer();
     const assetId = newAssetId();
@@ -370,18 +371,32 @@ async function downloadAndStoreImage(
     });
 
     logger.info(
-      `[Crawler][${jobId}] Downloaded the image as assetId: ${assetId}`,
+      `[Crawler][${jobId}] Downloaded ${fileType} as assetId: ${assetId}`,
     );
 
     return assetId;
   } catch (e) {
     logger.error(
-      `[Crawler][${jobId}] Failed to download and store image: ${e}`,
+      `[Crawler][${jobId}] Failed to download and store ${fileType}: ${e}`,
     );
     return null;
   }
 }
 
+async function downloadAndStoreImage(
+  url: string,
+  userId: string,
+  jobId: string,
+) {
+  if (!serverConfig.crawler.downloadBannerImage) {
+    logger.info(
+      `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+    );
+    return null;
+  }
+  return downloadAndStoreFile(url, userId, jobId, "image");
+}
+
 async function archiveWebpage(
   html: string,
   url: string,
@@ -415,31 +430,70 @@ async function archiveWebpage(
   return assetId;
 }
 
-async function runCrawler(job: Job) {
-  const jobId = job.id ?? "unknown";
-
-  const request = zCrawlLinkRequestSchema.safeParse(job.data);
-  if (!request.success) {
+async function getContentType(
+  url: string,
+  jobId: string,
+): Promise<string | null> {
+  try {
+    logger.info(
+      `[Crawler][${jobId}] Attempting to determine the content-type for the url ${url}`,
+    );
+    const response = await fetch(url, {
+      method: "HEAD",
+      signal: AbortSignal.timeout(5000),
+    });
+    const contentType = response.headers.get("content-type");
+    logger.info(
+      `[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
+    );
+    return contentType;
+  } catch (e) {
     logger.error(
-      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+      `[Crawler][${jobId}] Failed to determine the content-type for the url ${url}: ${e}`,
     );
-    return;
+    return null;
   }
+}
 
-  const { bookmarkId } = request.data;
-  const {
-    url,
-    userId,
-    screenshotAssetId: oldScreenshotAssetId,
-    imageAssetId: oldImageAssetId,
-    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
-  } = await getBookmarkDetails(bookmarkId);
-
-  logger.info(
-    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
-  );
-  validateUrl(url);
+/**
+ * Downloads the pdf asset from the URL and transforms the linkBookmark to an assetBookmark
+ * @param url the url the user provided
+ * @param userId the id of the user
+ * @param jobId the id of the job for logging
+ * @param bookmarkId the id of the bookmark
+ */
+async function handlePDFAsAssetBookmark(
+  url: string,
+  userId: string,
+  jobId: string,
+  bookmarkId: string,
+) {
+  const assetId = await downloadAndStoreFile(url, userId, jobId, "pdf");
+  if (!assetId) {
+    return;
+  }
+  await db.transaction(async (trx) => {
+    await trx.insert(bookmarkAssets).values({
+      id: bookmarkId,
+      assetType: "pdf",
+      assetId,
+      content: null,
+      fileName: path.basename(new URL(url).pathname),
+      sourceUrl: url,
+    });
+    await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId));
+  });
+}
+async function crawlAndParseUrl(
+  url: string,
+  userId: string,
+  jobId: string,
+  bookmarkId: string,
+  oldScreenshotAssetId: string | null,
+  oldImageAssetId: string | null,
+  oldFullPageArchiveAssetId: string | null,
+) {
 
   const {
     htmlContent,
     screenshot,
@@ -482,6 +536,78 @@ async function runCrawler(job: Job) {
       : {},
   ]);
 
+  return async () => {
+    if (serverConfig.crawler.fullPageArchive) {
+      const fullPageArchiveAssetId = await archiveWebpage(
+        htmlContent,
+        browserUrl,
+        userId,
+        jobId,
+      );
+
+      await db
+        .update(bookmarkLinks)
+        .set({
+          fullPageArchiveAssetId,
+        })
+        .where(eq(bookmarkLinks.id, bookmarkId));
+
+      if (oldFullPageArchiveAssetId) {
+        deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+          () => ({}),
+        );
+      }
+    }
+  };
+}
+
+async function runCrawler(job: Job) {
+  const jobId = job.id ?? "unknown";
+
+  const request = zCrawlLinkRequestSchema.safeParse(job.data);
+  if (!request.success) {
+    logger.error(
+      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+    );
+    return;
+  }
+
+  const { bookmarkId } = request.data;
+  const {
+    url,
+    userId,
+    screenshotAssetId: oldScreenshotAssetId,
+    imageAssetId: oldImageAssetId,
+    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
+  } = await getBookmarkDetails(bookmarkId);
+
+  logger.info(
+    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
+  );
+  validateUrl(url);
+
+  const contentType = await getContentType(url, jobId);
+
+  // Link bookmarks get transformed into asset bookmarks if they point to a pdf asset instead of a webpage
+  const isPdf = contentType === ASSET_TYPES.APPLICATION_PDF;
+
+  let archivalLogic: () => Promise<void> = () => {
+    return Promise.resolve();
+  };
+  if (isPdf) {
+    await handlePDFAsAssetBookmark(url, userId, jobId, bookmarkId);
+  } else {
+    archivalLogic = await crawlAndParseUrl(
+      url,
+      userId,
+      jobId,
+      bookmarkId,
+      oldScreenshotAssetId,
+      oldImageAssetId,
+      oldFullPageArchiveAssetId,
+    );
+  }
+
   // Enqueue openai job (if not set, assume it's true for backward compatibility)
   if (job.data.runInference !== false) {
     OpenAIQueue.add("openai", {
@@ -493,25 +619,5 @@ async function runCrawler(job: Job) {
   triggerSearchReindex(bookmarkId);
 
   // Do the archival as a separate last step as it has the potential for failure
-  if (serverConfig.crawler.fullPageArchive) {
-    const fullPageArchiveAssetId = await archiveWebpage(
-      htmlContent,
-      browserUrl,
-      userId,
-      jobId,
-    );
-
-    await db
-      .update(bookmarkLinks)
-      .set({
-        fullPageArchiveAssetId,
-      })
-      .where(eq(bookmarkLinks.id, bookmarkId));
-
-    if (oldFullPageArchiveAssetId) {
-      deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
-        () => ({}),
-      );
-    }
-  }
+  await archivalLogic();
 }
--
cgit v1.2.3-70-g09d2