Diffstat (limited to 'apps')
-rw-r--r--   apps/web/components/dashboard/preview/BookmarkPreview.tsx    16
-rw-r--r--   apps/workers/crawlerWorker.ts                               220
2 files changed, 177 insertions, 59 deletions
diff --git a/apps/web/components/dashboard/preview/BookmarkPreview.tsx b/apps/web/components/dashboard/preview/BookmarkPreview.tsx
index be11b47b..6a1068af 100644
--- a/apps/web/components/dashboard/preview/BookmarkPreview.tsx
+++ b/apps/web/components/dashboard/preview/BookmarkPreview.tsx
@@ -65,6 +65,16 @@ function CreationTime({ createdAt }: { createdAt: Date }) {
   );
 }
 
+function getSourceUrl(bookmark: ZBookmark) {
+  if (bookmark.content.type === "link") {
+    return bookmark.content.url;
+  }
+  if (bookmark.content.type === "asset") {
+    return bookmark.content.sourceUrl;
+  }
+  return null;
+}
+
 export default function BookmarkPreview({
   bookmarkId,
   initialData,
@@ -112,6 +122,8 @@ export default function BookmarkPreview({
     }
   }
 
+  const sourceUrl = getSourceUrl(bookmark);
+
   return (
     <div className="grid h-full grid-rows-3 gap-2 overflow-hidden bg-background lg:grid-cols-3 lg:grid-rows-none">
       <div className="row-span-2 h-full w-full overflow-auto p-2 md:col-span-2 lg:row-auto">
@@ -120,9 +132,9 @@ export default function BookmarkPreview({
       <div className="lg:col-span1 row-span-1 flex flex-col gap-4 overflow-auto bg-accent p-4 lg:row-auto">
         <div className="flex w-full flex-col items-center justify-center gap-y-2">
           <EditableTitle bookmark={bookmark} />
-          {bookmark.content.type == "link" && (
+          {sourceUrl && (
             <Link
-              href={bookmark.content.url}
+              href={sourceUrl}
               className="flex items-center gap-2 text-gray-400"
             >
               <span>View Original</span>
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 58f1aa85..eedb7b1e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,5 +1,6 @@
 import assert from "assert";
 import * as dns from "dns";
+import * as path from "node:path";
 import type { Job } from "bullmq";
 import type { Browser } from "puppeteer";
 import { Readability } from "@mozilla/readability";
@@ -26,8 +27,9 @@ import { withTimeout } from "utils";
 
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
 import { db } from "@hoarder/db";
-import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
+import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
 import {
+  ASSET_TYPES,
   deleteAsset,
   newAssetId,
   saveAsset,
@@ -68,7 +70,7 @@ async function startBrowserInstance() {
     logger.info(
       `[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
     );
-    return await puppeteer.connect({
+    return puppeteer.connect({
       browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
       defaultViewport,
     });
@@ -83,13 +85,13 @@ async function startBrowserInstance() {
     logger.info(
       `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
     );
-    return await puppeteer.connect({
+    return puppeteer.connect({
       browserURL: webUrl.toString(),
       defaultViewport,
     });
   } else {
     logger.info(`Launching a new browser instance`);
-    return await puppeteer.launch({
+    return puppeteer.launch({
       headless: serverConfig.crawler.headlessBrowser,
       defaultViewport,
     });
@@ -271,7 +273,11 @@ async function crawlPage(jobId: string, url: string) {
     logger.info(
       `[Crawler][${jobId}] Finished capturing page content and a screenshot. FullPageScreenshot: ${serverConfig.crawler.fullPageScreenshot}`,
     );
-    return { htmlContent, screenshot, url: page.url() };
+    return {
+      htmlContent,
+      screenshot,
+      url: page.url(),
+    };
   } finally {
     await context.close();
   }
@@ -337,22 +343,17 @@ async function storeScreenshot(
   return assetId;
 }
 
-async function downloadAndStoreImage(
+async function downloadAndStoreFile(
   url: string,
   userId: string,
   jobId: string,
+  fileType: string,
 ) {
-  if (!serverConfig.crawler.downloadBannerImage) {
-    logger.info(
-      `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
-    );
-    return null;
-  }
   try {
-    logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
+    logger.info(`[Crawler][${jobId}] Downloading ${fileType} from "${url}"`);
     const response = await fetch(url);
     if (!response.ok) {
-      throw new Error(`Failed to download image: ${response.status}`);
+      throw new Error(`Failed to download ${fileType}: ${response.status}`);
     }
     const buffer = await response.arrayBuffer();
     const assetId = newAssetId();
@@ -370,18 +371,32 @@ async function downloadAndStoreImage(
     });
 
     logger.info(
-      `[Crawler][${jobId}] Downloaded the image as assetId: ${assetId}`,
+      `[Crawler][${jobId}] Downloaded ${fileType} as assetId: ${assetId}`,
     );
     return assetId;
   } catch (e) {
     logger.error(
-      `[Crawler][${jobId}] Failed to download and store image: ${e}`,
+      `[Crawler][${jobId}] Failed to download and store ${fileType}: ${e}`,
     );
     return null;
   }
 }
 
+async function downloadAndStoreImage(
+  url: string,
+  userId: string,
+  jobId: string,
+) {
+  if (!serverConfig.crawler.downloadBannerImage) {
+    logger.info(
+      `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+    );
+    return null;
+  }
+  return downloadAndStoreFile(url, userId, jobId, "image");
+}
+
 async function archiveWebpage(
   html: string,
   url: string,
@@ -415,31 +430,70 @@ async function archiveWebpage(
   return assetId;
 }
 
-async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
-  const jobId = job.id ?? "unknown";
-
-  const request = zCrawlLinkRequestSchema.safeParse(job.data);
-  if (!request.success) {
+async function getContentType(
+  url: string,
+  jobId: string,
+): Promise<string | null> {
+  try {
+    logger.info(
+      `[Crawler][${jobId}] Attempting to determine the content-type for the url ${url}`,
+    );
+    const response = await fetch(url, {
+      method: "HEAD",
+      signal: AbortSignal.timeout(5000),
+    });
+    const contentType = response.headers.get("content-type");
+    logger.info(
+      `[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
+    );
+    return contentType;
+  } catch (e) {
     logger.error(
-      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+      `[Crawler][${jobId}] Failed to determine the content-type for the url ${url}: ${e}`,
     );
-    return;
+    return null;
   }
+}
 
-  const { bookmarkId } = request.data;
-  const {
-    url,
-    userId,
-    screenshotAssetId: oldScreenshotAssetId,
-    imageAssetId: oldImageAssetId,
-    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
-  } = await getBookmarkDetails(bookmarkId);
-
-  logger.info(
-    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
-  );
-  validateUrl(url);
-
+/**
+ * Downloads the pdf asset from the URL and transforms the linkBookmark to an assetBookmark
+ * @param url the url the user provided
+ * @param userId the id of the user
+ * @param jobId the id of the job for logging
+ * @param bookmarkId the id of the bookmark
+ */
+async function handlePDFAsAssetBookmark(
+  url: string,
+  userId: string,
+  jobId: string,
+  bookmarkId: string,
+) {
+  const assetId = await downloadAndStoreFile(url, userId, jobId, "pdf");
+  if (!assetId) {
+    return;
+  }
+  await db.transaction(async (trx) => {
+    await trx.insert(bookmarkAssets).values({
+      id: bookmarkId,
+      assetType: "pdf",
+      assetId,
+      content: null,
+      fileName: path.basename(new URL(url).pathname),
+      sourceUrl: url,
+    });
+    await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId));
+  });
+}
+
+async function crawlAndParseUrl(
+  url: string,
+  userId: string,
+  jobId: string,
+  bookmarkId: string,
+  oldScreenshotAssetId: string | null,
+  oldImageAssetId: string | null,
+  oldFullPageArchiveAssetId: string | null,
+) {
   const {
     htmlContent,
     screenshot,
@@ -482,6 +536,78 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
       : {},
   ]);
 
+  return async () => {
+    if (serverConfig.crawler.fullPageArchive) {
+      const fullPageArchiveAssetId = await archiveWebpage(
+        htmlContent,
+        browserUrl,
+        userId,
+        jobId,
+      );
+
+      await db
+        .update(bookmarkLinks)
+        .set({
+          fullPageArchiveAssetId,
+        })
+        .where(eq(bookmarkLinks.id, bookmarkId));
+
+      if (oldFullPageArchiveAssetId) {
+        deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+          () => ({}),
+        );
+      }
+    }
+  };
+}
+
+async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
+  const jobId = job.id ?? "unknown";
+
+  const request = zCrawlLinkRequestSchema.safeParse(job.data);
+  if (!request.success) {
+    logger.error(
+      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+    );
+    return;
+  }
+
+  const { bookmarkId } = request.data;
+  const {
+    url,
+    userId,
+    screenshotAssetId: oldScreenshotAssetId,
+    imageAssetId: oldImageAssetId,
+    fullPageArchiveAssetId: oldFullPageArchiveAssetId,
+  } = await getBookmarkDetails(bookmarkId);
+
+  logger.info(
+    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
+  );
+  validateUrl(url);
+
+  const contentType = await getContentType(url, jobId);
+
+  // Link bookmarks get transformed into asset bookmarks if they point to a pdf asset instead of a webpage
+  const isPdf = contentType === ASSET_TYPES.APPLICATION_PDF;
+
+  let archivalLogic: () => Promise<void> = () => {
+    return Promise.resolve();
+  };
+  if (isPdf) {
+    await handlePDFAsAssetBookmark(url, userId, jobId, bookmarkId);
+  } else {
+    archivalLogic = await crawlAndParseUrl(
+      url,
+      userId,
+      jobId,
+      bookmarkId,
+      oldScreenshotAssetId,
+      oldImageAssetId,
+      oldFullPageArchiveAssetId,
+    );
+  }
+
   // Enqueue openai job (if not set, assume it's true for backward compatibility)
   if (job.data.runInference !== false) {
     OpenAIQueue.add("openai", {
@@ -493,25 +619,5 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
   triggerSearchReindex(bookmarkId);
 
   // Do the archival as a separate last step as it has the potential for failure
-  if (serverConfig.crawler.fullPageArchive) {
-    const fullPageArchiveAssetId = await archiveWebpage(
-      htmlContent,
-      browserUrl,
-      userId,
-      jobId,
-    );
-
-    await db
-      .update(bookmarkLinks)
-      .set({
-        fullPageArchiveAssetId,
-      })
-      .where(eq(bookmarkLinks.id, bookmarkId));
-
-    if (oldFullPageArchiveAssetId) {
-      deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
-        () => ({}),
-      );
-    }
-  }
+  await archivalLogic();
 }
