diff options
| author | MohamedBassem <me@mbassem.com> | 2024-04-19 20:01:51 +0100 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2024-04-20 00:05:31 +0100 |
| commit | 4402e6f04170cbb0613d35fe94471162253e91b2 (patch) | |
| tree | 696f6511cefa7d1c6bc3a1f8bc2de755870310cc /apps | |
| parent | b4a13ce3d92ee505124fc98804935c1122978550 (diff) | |
| download | karakeep-4402e6f04170cbb0613d35fe94471162253e91b2.tar.zst | |
feature: Download images and screenshots
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/mobile/components/bookmarks/BookmarkCard.tsx | 69 | ||||
| -rw-r--r-- | apps/web/app/api/assets/route.ts | 4 | ||||
| -rw-r--r-- | apps/web/components/dashboard/bookmarks/AssetCard.tsx | 7 | ||||
| -rw-r--r-- | apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx | 2 | ||||
| -rw-r--r-- | apps/web/components/dashboard/bookmarks/LinkCard.tsx | 11 | ||||
| -rw-r--r-- | apps/web/components/dashboard/bookmarks/TextCard.tsx | 2 | ||||
| -rw-r--r-- | apps/web/components/dashboard/preview/BookmarkPreview.tsx | 14 | ||||
| -rw-r--r-- | apps/web/components/dashboard/preview/LinkContentSection.tsx | 77 | ||||
| -rw-r--r-- | apps/web/components/dashboard/preview/TextContentSection.tsx | 40 | ||||
| -rw-r--r-- | apps/web/lib/bookmarkUtils.tsx | 22 | ||||
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 158 |
11 files changed, 275 insertions, 131 deletions
diff --git a/apps/mobile/components/bookmarks/BookmarkCard.tsx b/apps/mobile/components/bookmarks/BookmarkCard.tsx index 6662e76a..c995d593 100644 --- a/apps/mobile/components/bookmarks/BookmarkCard.tsx +++ b/apps/mobile/components/bookmarks/BookmarkCard.tsx @@ -21,33 +21,17 @@ import { useDeleteBookmark, useUpdateBookmark, } from "@hoarder/shared-react/hooks/bookmarks"; +import { + getBookmarkLinkImageUrl, + isBookmarkStillLoading, + isBookmarkStillTagging, +} from "@hoarder/shared-react/utils/bookmarkUtils"; import { TailwindResolver } from "../TailwindResolver"; import { Divider } from "../ui/Divider"; import { Skeleton } from "../ui/Skeleton"; import { useToast } from "../ui/Toast"; -const MAX_LOADING_MSEC = 30 * 1000; - -export function isBookmarkStillCrawling(bookmark: ZBookmark) { - return ( - bookmark.content.type === "link" && - !bookmark.content.crawledAt && - Date.now().valueOf() - bookmark.createdAt.valueOf() < MAX_LOADING_MSEC - ); -} - -export function isBookmarkStillTagging(bookmark: ZBookmark) { - return ( - bookmark.taggingStatus === "pending" && - Date.now().valueOf() - bookmark.createdAt.valueOf() < MAX_LOADING_MSEC - ); -} - -export function isBookmarkStillLoading(bookmark: ZBookmark) { - return isBookmarkStillTagging(bookmark) || isBookmarkStillCrawling(bookmark); -} - function ActionBar({ bookmark }: { bookmark: ZBookmark }) { const { toast } = useToast(); @@ -176,6 +160,7 @@ function TagList({ bookmark }: { bookmark: ZBookmark }) { } function LinkCard({ bookmark }: { bookmark: ZBookmark }) { + const { settings } = useAppSettings(); if (bookmark.content.type !== "link") { throw new Error("Wrong content type rendered"); } @@ -183,18 +168,36 @@ function LinkCard({ bookmark }: { bookmark: ZBookmark }) { const url = bookmark.content.url; const parsedUrl = new URL(url); - const imageComp = bookmark.content.imageUrl ? ( - <Image - source={{ uri: bookmark.content.imageUrl }} - className="h-56 min-h-56 w-full object-cover" - /> - ) : ( - <Image - // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment - source={require("@/assets/blur.jpeg")} - className="h-56 w-full rounded-t-lg" - /> - ); + const imageUrl = getBookmarkLinkImageUrl(bookmark.content); + + let imageComp; + if (imageUrl) { + imageComp = ( + <Image + source={ + imageUrl.localAsset + ? { + uri: `${settings.address}${imageUrl.url}`, + headers: { + Authorization: `Bearer ${settings.apiKey}`, + }, + } + : { + uri: imageUrl.url, + } + } + className="h-56 min-h-56 w-full object-cover" + /> + ); + } else { + imageComp = ( + <Image + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + source={require("@/assets/blur.jpeg")} + className="h-56 w-full rounded-t-lg" + /> + ); + } return ( <View className="flex gap-2"> diff --git a/apps/web/app/api/assets/route.ts b/apps/web/app/api/assets/route.ts index c77751d3..a1ebea0f 100644 --- a/apps/web/app/api/assets/route.ts +++ b/apps/web/app/api/assets/route.ts @@ -2,7 +2,7 @@ import { createContextFromRequest } from "@/server/api/client"; import { TRPCError } from "@trpc/server"; import type { ZUploadResponse } from "@hoarder/shared/types/uploads"; -import { saveAsset } from "@hoarder/shared/assetdb"; +import { newAssetId, saveAsset } from "@hoarder/shared/assetdb"; import serverConfig from "@hoarder/shared/config"; const SUPPORTED_ASSET_TYPES = new Set([ @@ -46,7 +46,7 @@ export async function POST(request: Request) { return Response.json({ error: "Bad request" }, { status: 400 }); } - const assetId = crypto.randomUUID(); + const assetId = newAssetId(); const fileName = data.name; await saveAsset({ diff --git a/apps/web/components/dashboard/bookmarks/AssetCard.tsx b/apps/web/components/dashboard/bookmarks/AssetCard.tsx index c9a43575..40f435de 100644 --- a/apps/web/components/dashboard/bookmarks/AssetCard.tsx +++ b/apps/web/components/dashboard/bookmarks/AssetCard.tsx @@ -1,13 +1,14 @@ "use client"; import Image from "next/image"; -import { isBookmarkStillTagging } from "@/lib/bookmarkUtils"; import { api } from "@/lib/trpc"; import type { ZBookmark, ZBookmarkTypeAsset, } from "@hoarder/shared/types/bookmarks"; +import { getAssetUrl } from "@hoarder/shared-react/utils/assetUtils"; +import { isBookmarkStillTagging } from "@hoarder/shared-react/utils/bookmarkUtils"; import { BookmarkLayoutAdaptingCard } from "./BookmarkLayoutAdaptingCard"; @@ -24,7 +25,7 @@ function AssetImage({ return ( <Image alt="asset" - src={`/api/assets/${bookmarkedAsset.assetId}`} + src={getAssetUrl(bookmarkedAsset.assetId)} fill={true} className={className} /> @@ -35,7 +36,7 @@ function AssetImage({ <iframe title={bookmarkedAsset.assetId} className={className} - src={`/api/assets/${bookmarkedAsset.assetId}`} + src={getAssetUrl(bookmarkedAsset.assetId)} /> ); } diff --git a/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx b/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx index 42c4db21..d282c3f4 100644 --- a/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx +++ b/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx @@ -1,7 +1,6 @@ import type { BookmarksLayoutTypes } from "@/lib/userLocalSettings/types"; import React from "react"; import Link from "next/link"; -import { isBookmarkStillTagging } from "@/lib/bookmarkUtils"; import { bookmarkLayoutSwitch, useBookmarkLayout, @@ -10,6 +9,7 @@ import { cn } from "@/lib/utils"; import dayjs from "dayjs"; import type { ZBookmark } from "@hoarder/shared/types/bookmarks"; +import { isBookmarkStillTagging } from "@hoarder/shared-react/utils/bookmarkUtils"; import BookmarkActionBar from "./BookmarkActionBar"; import TagList from "./TagList"; diff --git a/apps/web/components/dashboard/bookmarks/LinkCard.tsx b/apps/web/components/dashboard/bookmarks/LinkCard.tsx index ef0ae6f2..3bb1698f 100644 --- a/apps/web/components/dashboard/bookmarks/LinkCard.tsx +++ b/apps/web/components/dashboard/bookmarks/LinkCard.tsx @@ -1,13 +1,14 @@ "use client"; import Link from "next/link"; -import { - isBookmarkStillCrawling, - isBookmarkStillLoading, -} from "@/lib/bookmarkUtils"; import { api } from "@/lib/trpc"; import type { ZBookmarkTypeLink } from "@hoarder/shared/types/bookmarks"; +import { + getBookmarkLinkImageUrl, + isBookmarkStillCrawling, + isBookmarkStillLoading, +} from "@hoarder/shared-react/utils/bookmarkUtils"; import { BookmarkLayoutAdaptingCard } from "./BookmarkLayoutAdaptingCard"; @@ -33,7 +34,7 @@ function LinkImage({ // A dummy white pixel for when there's no image. // TODO: Better handling for cards with no images const image = - link.imageUrl ?? + getBookmarkLinkImageUrl(link)?.url ?? ""; return ( <Link href={link.url} target="_blank"> diff --git a/apps/web/components/dashboard/bookmarks/TextCard.tsx b/apps/web/components/dashboard/bookmarks/TextCard.tsx index 9d5c8d8b..74b3e8e5 100644 --- a/apps/web/components/dashboard/bookmarks/TextCard.tsx +++ b/apps/web/components/dashboard/bookmarks/TextCard.tsx @@ -1,13 +1,13 @@ "use client"; import { useState } from "react"; -import { isBookmarkStillTagging } from "@/lib/bookmarkUtils"; import { api } from "@/lib/trpc"; import { bookmarkLayoutSwitch } from "@/lib/userLocalSettings/bookmarksLayout"; import { cn } from "@/lib/utils"; import Markdown from "react-markdown"; import type { ZBookmark } from "@hoarder/shared/types/bookmarks"; +import { isBookmarkStillTagging } from "@hoarder/shared-react/utils/bookmarkUtils"; import { BookmarkedTextViewer } from "./BookmarkedTextViewer"; import { BookmarkLayoutAdaptingCard } from "./BookmarkLayoutAdaptingCard"; diff --git a/apps/web/components/dashboard/preview/BookmarkPreview.tsx b/apps/web/components/dashboard/preview/BookmarkPreview.tsx index 29e8e39a..581ec4bd 100644 --- a/apps/web/components/dashboard/preview/BookmarkPreview.tsx +++ b/apps/web/components/dashboard/preview/BookmarkPreview.tsx @@ -11,20 +11,21 @@ import { TooltipPortal, TooltipTrigger, } from "@/components/ui/tooltip"; -import { - isBookmarkStillCrawling, - isBookmarkStillLoading, -} from "@/lib/bookmarkUtils"; import { api } from "@/lib/trpc"; import dayjs from "dayjs"; import relativeTime from "dayjs/plugin/relativeTime"; import { CalendarDays, ExternalLink } from "lucide-react"; import type { ZBookmark } from "@hoarder/shared/types/bookmarks"; +import { + isBookmarkStillCrawling, + isBookmarkStillLoading, +} from "@hoarder/shared-react/utils/bookmarkUtils"; import ActionBar from "./ActionBar"; import { AssetContentSection } from "./AssetContentSection"; import { EditableTitle } from "./EditableTitle"; +import LinkContentSection from "./LinkContentSection"; import { NoteEditor } from "./NoteEditor"; import { TextContentSection } from "./TextContentSection"; @@ -90,7 +91,10 @@ export default function BookmarkPreview({ let content; switch (bookmark.content.type) { - case "link": + case "link": { + content = <LinkContentSection bookmark={bookmark} />; + break; + } case "text": { content = <TextContentSection bookmark={bookmark} />; break; diff --git a/apps/web/components/dashboard/preview/LinkContentSection.tsx b/apps/web/components/dashboard/preview/LinkContentSection.tsx new file mode 100644 index 00000000..ff08c661 --- /dev/null +++ b/apps/web/components/dashboard/preview/LinkContentSection.tsx @@ -0,0 +1,77 @@ +import { useState } from "react"; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { ScrollArea } from "@radix-ui/react-scroll-area"; + +import { ZBookmark, ZBookmarkedLink } from "@hoarder/shared/types/bookmarks"; + +function ScreenshotSection({ link }: { link: ZBookmarkedLink }) { + // eslint-disable-next-line @next/next/no-img-element + return <img alt="screenshot" src={`/api/assets/${link.screenshotAssetId}`} />; +} + +function CachedContentSection({ link }: { link: ZBookmarkedLink }) { + let content; + if (!link.htmlContent) { + content = ( + <div className="text-destructive">Failed to fetch link content ...</div> + ); + } else { + content = ( + <div + dangerouslySetInnerHTML={{ + __html: link.htmlContent || "", + }} + className="prose mx-auto dark:prose-invert" + /> + ); + } + return content; +} + +export default function LinkContentSection({ + bookmark, +}: { + bookmark: ZBookmark; +}) { + const [section, setSection] = useState<string>("cached"); + + if (bookmark.content.type != "link") { + throw new Error("Invalid content type"); + } + + let content; + if (section === "cached") { + content = <CachedContentSection link={bookmark.content} />; + } else { + content = <ScreenshotSection link={bookmark.content} />; + } + + return ( + <div className="flex flex-col items-center gap-2"> + <Select onValueChange={setSection} value={section}> + <SelectTrigger className="w-fit"> + <SelectValue /> + </SelectTrigger> + <SelectContent> + <SelectGroup> + <SelectItem value="cached">Cached Content</SelectItem> + <SelectItem + value="screenshot" + disabled={!bookmark.content.screenshotAssetId} + > + Screenshot + </SelectItem> + </SelectGroup> + </SelectContent> + </Select> + <ScrollArea className="h-full">{content}</ScrollArea> + </div> + ); +} diff --git a/apps/web/components/dashboard/preview/TextContentSection.tsx b/apps/web/components/dashboard/preview/TextContentSection.tsx index a73ad722..eba7d28b 100644 --- a/apps/web/components/dashboard/preview/TextContentSection.tsx +++ b/apps/web/components/dashboard/preview/TextContentSection.tsx @@ -4,36 +4,14 @@ import Markdown from "react-markdown"; import type { ZBookmark } from "@hoarder/shared/types/bookmarks"; export function TextContentSection({ bookmark }: { bookmark: ZBookmark }) { - let content; - switch (bookmark.content.type) { - case "link": { - if (!bookmark.content.htmlContent) { - content = ( - <div className="text-destructive"> - Failed to fetch link content ... - </div> - ); - } else { - content = ( - <div - dangerouslySetInnerHTML={{ - __html: bookmark.content.htmlContent || "", - }} - className="prose mx-auto dark:prose-invert" - /> - ); - } - break; - } - case "text": { - content = ( - <Markdown className="prose mx-auto dark:prose-invert"> - {bookmark.content.text} - </Markdown> - ); - break; - } + if (bookmark.content.type != "text") { + throw new Error("Invalid content type"); } - - return <ScrollArea className="h-full">{content}</ScrollArea>; + return ( + <ScrollArea className="h-full"> + <Markdown className="prose mx-auto dark:prose-invert"> + {bookmark.content.text} + </Markdown> + </ScrollArea> + ); } diff --git a/apps/web/lib/bookmarkUtils.tsx b/apps/web/lib/bookmarkUtils.tsx deleted file mode 100644 index 475ba383..00000000 --- a/apps/web/lib/bookmarkUtils.tsx +++ /dev/null @@ -1,22 +0,0 @@ -import type { ZBookmark } from "@hoarder/shared/types/bookmarks"; - -const MAX_LOADING_MSEC = 30 * 1000; - -export function isBookmarkStillCrawling(bookmark: ZBookmark) { - return ( - bookmark.content.type == "link" && - !bookmark.content.crawledAt && - Date.now().valueOf() - bookmark.createdAt.valueOf() < MAX_LOADING_MSEC - ); -} - -export function isBookmarkStillTagging(bookmark: ZBookmark) { - return ( - bookmark.taggingStatus == "pending" && - Date.now().valueOf() - bookmark.createdAt.valueOf() < MAX_LOADING_MSEC - ); -} - -export function isBookmarkStillLoading(bookmark: ZBookmark) { - return isBookmarkStillTagging(bookmark) || isBookmarkStillCrawling(bookmark); -} diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index 91b0a03f..27e9e14c 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -24,7 +24,8 @@ import { withTimeout } from "utils"; import type { ZCrawlLinkRequest } from "@hoarder/shared/queues"; import { db } from "@hoarder/db"; -import { bookmarkLinks } from "@hoarder/db/schema"; +import { bookmarkLinks, bookmarks } from "@hoarder/db/schema"; +import { newAssetId, saveAsset } from "@hoarder/shared/assetdb"; import serverConfig from "@hoarder/shared/config"; import logger from "@hoarder/shared/logger"; import { @@ -155,15 +156,16 @@ async function changeBookmarkStatus( .where(eq(bookmarkLinks.id, bookmarkId)); } -async function getBookmarkUrl(bookmarkId: string) { - const bookmark = await db.query.bookmarkLinks.findFirst({ - where: eq(bookmarkLinks.id, bookmarkId), +async function getBookmarkDetails(bookmarkId: string) { + const bookmark = await db.query.bookmarks.findFirst({ + where: eq(bookmarks.id, bookmarkId), + with: { link: true }, }); - if (!bookmark) { + if (!bookmark || !bookmark.link) { throw new Error("The bookmark either doesn't exist or not a link"); } - return bookmark.url; + return { url: bookmark.link.url, userId: bookmark.userId }; } /** @@ -208,13 +210,116 @@ async function crawlPage(jobId: string, url: string) { logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`); - const htmlContent = await page.content(); - return htmlContent; + const [htmlContent, screenshot] = await Promise.all([ + page.content(), + page.screenshot({ + // If you change this, you need to change the asset type in the store function. + type: "png", + encoding: "binary", + }), + ]); + logger.info( + `[Crawler][${jobId}] Finished capturing page content and a screenshot.`, + ); + return { htmlContent, screenshot, url: page.url() }; } finally { await context.close(); } } +async function extractMetadata( + htmlContent: string, + url: string, + jobId: string, +) { + logger.info( + `[Crawler][${jobId}] Will attempt to extract metadata from page ...`, + ); + const meta = await metascraperParser({ + url, + html: htmlContent, + // We don't want to validate the URL again as we've already done it by visiting the page. + // This was added because URL validation fails if the URL ends with a question mark (e.g. empty query params). + validateUrl: false, + }); + logger.info(`[Crawler][${jobId}] Done extracting metadata from the page.`); + return meta; +} + +function extractReadableContent( + htmlContent: string, + url: string, + jobId: string, +) { + logger.info( + `[Crawler][${jobId}] Will attempt to extract readable content ...`, + ); + const window = new JSDOM("").window; + const purify = DOMPurify(window); + const purifiedHTML = purify.sanitize(htmlContent); + const purifiedDOM = new JSDOM(purifiedHTML, { url }); + const readableContent = new Readability(purifiedDOM.window.document).parse(); + logger.info(`[Crawler][${jobId}] Done extracting readable content.`); + return readableContent; +} + +async function storeScreenshot( + screenshot: Buffer, + userId: string, + jobId: string, +) { + const assetId = newAssetId(); + await saveAsset({ + userId, + assetId, + metadata: { contentType: "image/png", fileName: "screenshot.png" }, + asset: screenshot, + }); + logger.info( + `[Crawler][${jobId}] Stored the screenshot as assetId: ${assetId}`, + ); + return assetId; +} + +async function downloadAndStoreImage( + url: string, + userId: string, + jobId: string, +) { + try { + logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`); + const response = await fetch(url); + if (!response.ok) { + throw new Error(`Failed to download image: ${response.status}`); + } + const buffer = await response.arrayBuffer(); + const assetId = newAssetId(); + + const contentType = response.headers.get("content-type"); + if (!contentType) { + throw new Error("No content type in the response"); + } + + await saveAsset({ + userId, + assetId, + metadata: { contentType }, + asset: Buffer.from(buffer), + }); + + logger.info( + `[Crawler][${jobId}] Downloaded the image as assetId: ${assetId}`, + ); + + return assetId; + } catch (e) { + logger.error( + `[Crawler][${jobId}] Failed to download and store image: ${e}`, + ); + return null; + } +} + async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { const jobId = job.id ?? "unknown"; @@ -227,35 +332,30 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { } const { bookmarkId } = request.data; - const url = await getBookmarkUrl(bookmarkId); + const { url, userId } = await getBookmarkDetails(bookmarkId); logger.info( `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`, ); validateUrl(url); - const htmlContent = await crawlPage(jobId, url); - - logger.info( - `[Crawler][${jobId}] Will attempt to parse the content of the page ...`, - ); - const meta = await metascraperParser({ - url, - html: htmlContent, - // We don't want to validate the URL again as we've already done it by visiting the page. - // This was added because URL validation fails if the URL ends with a question mark (e.g. empty query params). - validateUrl: false, - }); - logger.info(`[Crawler][${jobId}] Done parsing the content of the page.`); + const { + htmlContent, + screenshot, + url: browserUrl, + } = await crawlPage(jobId, url); - const window = new JSDOM("").window; - const purify = DOMPurify(window); - const purifiedHTML = purify.sanitize(htmlContent); - const purifiedDOM = new JSDOM(purifiedHTML, { url }); - const readableContent = new Readability(purifiedDOM.window.document).parse(); + const [meta, readableContent, screenshotAssetId] = await Promise.all([ + extractMetadata(htmlContent, browserUrl, jobId), + extractReadableContent(htmlContent, browserUrl, jobId), + storeScreenshot(screenshot, userId, jobId), + ]); + let imageAssetId: string | null = null; + if (meta.image) { + imageAssetId = await downloadAndStoreImage(meta.image, userId, jobId); + } // TODO(important): Restrict the size of content to store - await db .update(bookmarkLinks) .set({ @@ -265,6 +365,8 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) { favicon: meta.logo, content: readableContent?.textContent, htmlContent: readableContent?.content, + screenshotAssetId, + imageAssetId, crawledAt: new Date(), }) .where(eq(bookmarkLinks.id, bookmarkId)); |
