From 705d539c8e9c6a86882825ee4dabeff3027ba827 Mon Sep 17 00:00:00 2001
From: Mohamed Bassem
Date: Sat, 30 Nov 2024 19:12:45 +0000
Subject: feature: Store crawling status code and allow users to find broken
 links. Fixes #169

---
 apps/web/app/settings/broken-links/page.tsx    | 131 +++++++++++++++++++++++++
 apps/web/components/settings/sidebar/items.tsx |   6 ++
 apps/web/lib/i18n/locales/en/translation.json  |   7 ++
 apps/workers/crawlerWorker.ts                  |  10 +-
 4 files changed, 150 insertions(+), 4 deletions(-)
 create mode 100644 apps/web/app/settings/broken-links/page.tsx
(limited to 'apps')

diff --git a/apps/web/app/settings/broken-links/page.tsx b/apps/web/app/settings/broken-links/page.tsx
new file mode 100644
index 00000000..0b83dfa9
--- /dev/null
+++ b/apps/web/app/settings/broken-links/page.tsx
@@ -0,0 +1,131 @@
+"use client";
+
+import { ActionButton } from "@/components/ui/action-button";
+import { FullPageSpinner } from "@/components/ui/full-page-spinner";
+import {
+  Table,
+  TableBody,
+  TableCell,
+  TableHead,
+  TableHeader,
+  TableRow,
+} from "@/components/ui/table";
+import { toast } from "@/components/ui/use-toast";
+import { RefreshCw, Trash2 } from "lucide-react";
+import { useTranslation } from "react-i18next";
+
+import {
+  useDeleteBookmark,
+  useRecrawlBookmark,
+} from "@hoarder/shared-react/hooks/bookmarks";
+import { api } from "@hoarder/shared-react/trpc";
+
+export default function BrokenLinksPage() {
+  const { t } = useTranslation();
+
+  const apiUtils = api.useUtils();
+  const { data, isPending } = api.bookmarks.getBrokenLinks.useQuery();
+
+  const { mutate: deleteBookmark, isPending: isDeleting } = useDeleteBookmark({
+    onSuccess: () => {
+      toast({
+        description: t("toasts.bookmarks.deleted"),
+      });
+      apiUtils.bookmarks.getBrokenLinks.invalidate();
+    },
+    onError: () => {
+      toast({
+        description: t("common.something_went_wrong"),
+        variant: "destructive",
+      });
+    },
+  });
+
+  const { mutate: recrawlBookmark, isPending: isRecrawling } =
+    useRecrawlBookmark({
+      onSuccess: () => {
+        toast({
+          description: t("toasts.bookmarks.refetch"),
+        });
+        apiUtils.bookmarks.getBrokenLinks.invalidate();
+      },
+      onError: () => {
+        toast({
+          description: t("common.something_went_wrong"),
+          variant: "destructive",
+        });
+      },
+    });
+
+  return (
+    <div>
+      <div>
+        <span>
+          {t("settings.broken_links.broken_links")}
+        </span>
+      </div>
+      <div>
+        {isPending && <FullPageSpinner />}
+        {!isPending && data && data.bookmarks.length == 0 && (
+          <div>
+            <p>
+              No broken links found
+            </p>
+          </div>
+        )}
+        {!isPending && data && data.bookmarks.length > 0 && (
+          <Table>
+            <TableHeader>
+              <TableRow>
+                <TableHead>{t("common.url")}</TableHead>
+                <TableHead>{t("common.created_at")}</TableHead>
+                <TableHead>
+                  {t("settings.broken_links.last_crawled_at")}
+                </TableHead>
+                <TableHead>
+                  {t("settings.broken_links.crawling_status")}
+                </TableHead>
+                <TableHead>{t("common.action")}</TableHead>
+              </TableRow>
+            </TableHeader>
+            <TableBody>
+              {data.bookmarks.map((b) => (
+                <TableRow key={b.id}>
+                  <TableCell>{b.url}</TableCell>
+                  <TableCell>{b.createdAt?.toLocaleString()}</TableCell>
+                  <TableCell>{b.crawledAt?.toLocaleString()}</TableCell>
+                  <TableCell>
+                    {b.isCrawlingFailure ? (
+                      <span>Failed</span>
+                    ) : (
+                      b.statusCode
+                    )}
+                  </TableCell>
+                  <TableCell>
+                    <ActionButton
+                      loading={isRecrawling}
+                      onClick={() => recrawlBookmark({ bookmarkId: b.id })}
+                      className="flex items-center gap-2"
+                    >
+                      <RefreshCw />
+                      {t("actions.recrawl")}
+                    </ActionButton>
+                    <ActionButton
+                      onClick={() => deleteBookmark({ bookmarkId: b.id })}
+                      loading={isDeleting}
+                      className="flex items-center gap-2"
+                    >
+                      <Trash2 />
+                      {t("actions.delete")}
+                    </ActionButton>
+                  </TableCell>
+                </TableRow>
+              ))}
+            </TableBody>
+          </Table>
+        )}
+      </div>
+    </div>
+  );
+}
diff --git a/apps/web/components/settings/sidebar/items.tsx b/apps/web/components/settings/sidebar/items.tsx
index 43dfabdd..f76d494a 100644
--- a/apps/web/components/settings/sidebar/items.tsx
+++ b/apps/web/components/settings/sidebar/items.tsx
@@ -4,6 +4,7 @@ import {
   ArrowLeft,
   Download,
   KeyRound,
+  Link,
   Rss,
   Sparkles,
   User,
@@ -46,4 +47,9 @@ export const settingsSidebarItems = (
     icon: <KeyRound />,
     path: "/settings/api-keys",
   },
+  {
+    name: t("settings.broken_links.broken_links"),
+    icon: <Link />,
+    path: "/settings/broken-links",
+  },
 ];
diff --git a/apps/web/lib/i18n/locales/en/translation.json b/apps/web/lib/i18n/locales/en/translation.json
index 530d489a..9f12487f 100644
--- a/apps/web/lib/i18n/locales/en/translation.json
+++ b/apps/web/lib/i18n/locales/en/translation.json
@@ -38,6 +38,7 @@
     "unfavorite": "Unfavorite",
     "delete": "Delete",
     "refresh": "Refresh",
+    "recrawl": "Recrawl",
     "download_full_page_archive": "Download Full Page Archive",
     "edit_tags": "Edit Tags",
     "add_to_list": "Add to List",
@@ -103,6 +104,12 @@
       "new_api_key_desc": "Give your API key a unique name",
       "key_success": "Key was successfully created",
       "key_success_please_copy": "Please copy the key and store it somewhere safe. Once you close the dialog, you won't be able to access it again."
+    },
+    "broken_links": {
+      "broken_links": "Broken Links",
+      "last_crawled_at": "Last Crawled At",
+      "crawling_status": "Crawling Status",
+      "crawling_failed": "Crawling Failed"
     }
   },
   "admin": {
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 3952a287..208de44b 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -241,14 +241,12 @@ async function browserlessCrawlPage(jobId: string, url: string) {
   const response = await fetch(url, {
     signal: AbortSignal.timeout(5000),
   });
-  if (!response.ok) {
-    throw new Error(`Failed to crawl page: ${response.status}`);
-  }
   logger.info(
     `[Crawler][${jobId}] Successfully fetched the content of "${url}". Status: ${response.status}, Size: ${response.size}`,
   );
   return {
     htmlContent: await response.text(),
+    statusCode: response.status,
     screenshot: undefined,
     url: response.url,
   };
@@ -260,6 +258,7 @@ async function crawlPage(
 ): Promise<{
   htmlContent: string;
   screenshot: Buffer | undefined;
+  statusCode: number;
   url: string;
 }> {
   let browser: Browser | undefined;
@@ -282,7 +281,7 @@ async function crawlPage(
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
   );

-  await page.goto(url, {
+  const response = await page.goto(url, {
     timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
   });
   logger.info(
@@ -328,6 +327,7 @@

   return {
     htmlContent,
+    statusCode: response?.status() ?? 0,
     screenshot,
     url: page.url(),
   };
@@ -583,6 +583,7 @@ async function crawlAndParseUrl(
   const {
     htmlContent,
     screenshot,
+    statusCode,
     url: browserUrl,
   } = await crawlPage(jobId, url);

@@ -618,6 +619,7 @@
       content: readableContent?.textContent,
       htmlContent: readableContent?.content,
       crawledAt: new Date(),
+      crawlStatusCode: statusCode,
     })
     .where(eq(bookmarkLinks.id, bookmarkId));

--
cgit v1.2.3-70-g09d2
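
Note on scope: the patch view above is limited to 'apps', so the server-side pieces the new settings page depends on (the bookmarks.getBrokenLinks tRPC procedure, the isCrawlingFailure and statusCode fields it returns, and the crawlStatusCode column on bookmarkLinks) are not shown in this diff. What the worker change does show is that the browserless path no longer throws on a non-OK response; both crawl paths now return a statusCode (0 when Puppeteer's goto() yields no response), which is written to crawlStatusCode alongside crawledAt. The TypeScript below is only a minimal sketch of how a "broken" link could be derived from those stored fields; the field names mirror the patch, but CrawledLink, isCrawlingFailure, isBrokenLink, findBrokenLinks, and the 2xx threshold are illustrative assumptions, not the project's actual getBrokenLinks implementation.

// Minimal sketch under the assumptions stated above; not code from this patch.
// The UI renders "Failed" when isCrawlingFailure is set and otherwise shows the
// raw status code, so the two notions are kept separate here.
interface CrawledLink {
  url: string;
  crawledAt: Date | null; // set by the worker after a crawl attempt
  crawlStatusCode: number | null; // stored by this patch; 0 when goto() returns no response
}

// "Crawling failure": a crawl was attempted but never produced a usable HTTP response.
export function isCrawlingFailure(link: CrawledLink): boolean {
  return (
    link.crawledAt !== null &&
    (link.crawlStatusCode === null || link.crawlStatusCode === 0)
  );
}

// "Broken link": the crawl failed outright, or the server answered with a
// non-2xx status (assumed threshold).
export function isBrokenLink(link: CrawledLink): boolean {
  if (link.crawledAt === null) return false; // never crawled: nothing to report yet
  if (isCrawlingFailure(link)) return true;
  const status = link.crawlStatusCode!; // non-null here: crawled and not a crawling failure
  return status < 200 || status >= 300;
}

// Example: reduce a list of crawled links to the rows the broken-links page would show.
export function findBrokenLinks(links: CrawledLink[]): CrawledLink[] {
  return links.filter(isBrokenLink);
}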