Diffstat (limited to 'apps')
-rw-r--r--  apps/web/components/dashboard/preview/BookmarkPreview.tsx   16
-rw-r--r--  apps/workers/crawlerWorker.ts                               220
2 files changed, 177 insertions, 59 deletions
diff --git a/apps/web/components/dashboard/preview/BookmarkPreview.tsx b/apps/web/components/dashboard/preview/BookmarkPreview.tsx
index be11b47b..6a1068af 100644
--- a/apps/web/components/dashboard/preview/BookmarkPreview.tsx
+++ b/apps/web/components/dashboard/preview/BookmarkPreview.tsx
@@ -65,6 +65,16 @@ function CreationTime({ createdAt }: { createdAt: Date }) {
);
}
+function getSourceUrl(bookmark: ZBookmark) {
+ if (bookmark.content.type === "link") {
+ return bookmark.content.url;
+ }
+ if (bookmark.content.type === "asset") {
+ return bookmark.content.sourceUrl;
+ }
+ return null;
+}
+
export default function BookmarkPreview({
bookmarkId,
initialData,
@@ -112,6 +122,8 @@ export default function BookmarkPreview({
}
}
+ const sourceUrl = getSourceUrl(bookmark);
+
return (
<div className="grid h-full grid-rows-3 gap-2 overflow-hidden bg-background lg:grid-cols-3 lg:grid-rows-none">
<div className="row-span-2 h-full w-full overflow-auto p-2 md:col-span-2 lg:row-auto">
@@ -120,9 +132,9 @@ export default function BookmarkPreview({
<div className="lg:col-span1 row-span-1 flex flex-col gap-4 overflow-auto bg-accent p-4 lg:row-auto">
<div className="flex w-full flex-col items-center justify-center gap-y-2">
<EditableTitle bookmark={bookmark} />
- {bookmark.content.type == "link" && (
+ {sourceUrl && (
<Link
- href={bookmark.content.url}
+ href={sourceUrl}
className="flex items-center gap-2 text-gray-400"
>
<span>View Original</span>
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 58f1aa85..eedb7b1e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,5 +1,6 @@
import assert from "assert";
import * as dns from "dns";
+import * as path from "node:path";
import type { Job } from "bullmq";
import type { Browser } from "puppeteer";
import { Readability } from "@mozilla/readability";
@@ -26,8 +27,9 @@ import { withTimeout } from "utils";
import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
-import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
+import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
import {
+ ASSET_TYPES,
deleteAsset,
newAssetId,
saveAsset,
@@ -68,7 +70,7 @@ async function startBrowserInstance() {
logger.info(
`[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
);
- return await puppeteer.connect({
+ return puppeteer.connect({
browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
defaultViewport,
});
@@ -83,13 +85,13 @@ async function startBrowserInstance() {
logger.info(
`[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
);
- return await puppeteer.connect({
+ return puppeteer.connect({
browserURL: webUrl.toString(),
defaultViewport,
});
} else {
logger.info(`Launching a new browser instance`);
- return await puppeteer.launch({
+ return puppeteer.launch({
headless: serverConfig.crawler.headlessBrowser,
defaultViewport,
});
@@ -271,7 +273,11 @@ async function crawlPage(jobId: string, url: string) {
logger.info(
`[Crawler][${jobId}] Finished capturing page content and a screenshot. FullPageScreenshot: ${serverConfig.crawler.fullPageScreenshot}`,
);
- return { htmlContent, screenshot, url: page.url() };
+ return {
+ htmlContent,
+ screenshot,
+ url: page.url(),
+ };
} finally {
await context.close();
}
@@ -337,22 +343,17 @@ async function storeScreenshot(
return assetId;
}
-async function downloadAndStoreImage(
+async function downloadAndStoreFile(
url: string,
userId: string,
jobId: string,
+ fileType: string,
) {
- if (!serverConfig.crawler.downloadBannerImage) {
- logger.info(
- `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
- );
- return null;
- }
try {
- logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
+ logger.info(`[Crawler][${jobId}] Downloading ${fileType} from "${url}"`);
const response = await fetch(url);
if (!response.ok) {
- throw new Error(`Failed to download image: ${response.status}`);
+ throw new Error(`Failed to download ${fileType}: ${response.status}`);
}
const buffer = await response.arrayBuffer();
const assetId = newAssetId();
@@ -370,18 +371,32 @@ async function downloadAndStoreImage(
});
logger.info(
- `[Crawler][${jobId}] Downloaded the image as assetId: ${assetId}`,
+ `[Crawler][${jobId}] Downloaded ${fileType} as assetId: ${assetId}`,
);
return assetId;
} catch (e) {
logger.error(
- `[Crawler][${jobId}] Failed to download and store image: ${e}`,
+ `[Crawler][${jobId}] Failed to download and store ${fileType}: ${e}`,
);
return null;
}
}
+async function downloadAndStoreImage(
+ url: string,
+ userId: string,
+ jobId: string,
+) {
+ if (!serverConfig.crawler.downloadBannerImage) {
+ logger.info(
+ `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+ );
+ return null;
+ }
+ return downloadAndStoreFile(url, userId, jobId, "image");
+}
+
async function archiveWebpage(
html: string,
url: string,
@@ -415,31 +430,70 @@ async function archiveWebpage(
return assetId;
}
-async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
- const jobId = job.id ?? "unknown";
-
- const request = zCrawlLinkRequestSchema.safeParse(job.data);
- if (!request.success) {
+async function getContentType(
+ url: string,
+ jobId: string,
+): Promise<string | null> {
+ try {
+ logger.info(
+ `[Crawler][${jobId}] Attempting to determine the content-type for the url ${url}`,
+ );
+ const response = await fetch(url, {
+ method: "HEAD",
+ signal: AbortSignal.timeout(5000),
+ });
+ const contentType = response.headers.get("content-type");
+ logger.info(
+ `[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
+ );
+ return contentType;
+ } catch (e) {
logger.error(
- `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+ `[Crawler][${jobId}] Failed to determine the content-type for the url ${url}: ${e}`,
);
- return;
+ return null;
}
+}
- const { bookmarkId } = request.data;
- const {
- url,
- userId,
- screenshotAssetId: oldScreenshotAssetId,
- imageAssetId: oldImageAssetId,
- fullPageArchiveAssetId: oldFullPageArchiveAssetId,
- } = await getBookmarkDetails(bookmarkId);
-
- logger.info(
- `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
- );
- validateUrl(url);
+/**
+ * Downloads the pdf asset from the URL and transforms the linkBookmark to an assetBookmark
+ * @param url the url the user provided
+ * @param userId the id of the user
+ * @param jobId the id of the job for logging
+ * @param bookmarkId the id of the bookmark
+ */
+async function handlePDFAsAssetBookmark(
+ url: string,
+ userId: string,
+ jobId: string,
+ bookmarkId: string,
+) {
+ const assetId = await downloadAndStoreFile(url, userId, jobId, "pdf");
+ if (!assetId) {
+ return;
+ }
+ await db.transaction(async (trx) => {
+ await trx.insert(bookmarkAssets).values({
+ id: bookmarkId,
+ assetType: "pdf",
+ assetId,
+ content: null,
+ fileName: path.basename(new URL(url).pathname),
+ sourceUrl: url,
+ });
+ await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId));
+ });
+}
+async function crawlAndParseUrl(
+ url: string,
+ userId: string,
+ jobId: string,
+ bookmarkId: string,
+ oldScreenshotAssetId: string | null,
+ oldImageAssetId: string | null,
+ oldFullPageArchiveAssetId: string | null,
+) {
const {
htmlContent,
screenshot,
@@ -482,6 +536,78 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
: {},
]);
+ return async () => {
+ if (serverConfig.crawler.fullPageArchive) {
+ const fullPageArchiveAssetId = await archiveWebpage(
+ htmlContent,
+ browserUrl,
+ userId,
+ jobId,
+ );
+
+ await db
+ .update(bookmarkLinks)
+ .set({
+ fullPageArchiveAssetId,
+ })
+ .where(eq(bookmarkLinks.id, bookmarkId));
+
+ if (oldFullPageArchiveAssetId) {
+ deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+ () => ({}),
+ );
+ }
+ }
+ };
+}
+
+async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
+ const jobId = job.id ?? "unknown";
+
+ const request = zCrawlLinkRequestSchema.safeParse(job.data);
+ if (!request.success) {
+ logger.error(
+ `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+ );
+ return;
+ }
+
+ const { bookmarkId } = request.data;
+ const {
+ url,
+ userId,
+ screenshotAssetId: oldScreenshotAssetId,
+ imageAssetId: oldImageAssetId,
+ fullPageArchiveAssetId: oldFullPageArchiveAssetId,
+ } = await getBookmarkDetails(bookmarkId);
+
+ logger.info(
+ `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
+ );
+ validateUrl(url);
+
+ const contentType = await getContentType(url, jobId);
+
+ // Link bookmarks get transformed into asset bookmarks if they point to a pdf asset instead of a webpage
+ const isPdf = contentType === ASSET_TYPES.APPLICATION_PDF;
+
+ let archivalLogic: () => Promise<void> = () => {
+ return Promise.resolve();
+ };
+ if (isPdf) {
+ await handlePDFAsAssetBookmark(url, userId, jobId, bookmarkId);
+ } else {
+ archivalLogic = await crawlAndParseUrl(
+ url,
+ userId,
+ jobId,
+ bookmarkId,
+ oldScreenshotAssetId,
+ oldImageAssetId,
+ oldFullPageArchiveAssetId,
+ );
+ }
+
// Enqueue openai job (if not set, assume it's true for backward compatibility)
if (job.data.runInference !== false) {
OpenAIQueue.add("openai", {
@@ -493,25 +619,5 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
triggerSearchReindex(bookmarkId);
// Do the archival as a separate last step as it has the potential for failure
- if (serverConfig.crawler.fullPageArchive) {
- const fullPageArchiveAssetId = await archiveWebpage(
- htmlContent,
- browserUrl,
- userId,
- jobId,
- );
-
- await db
- .update(bookmarkLinks)
- .set({
- fullPageArchiveAssetId,
- })
- .where(eq(bookmarkLinks.id, bookmarkId));
-
- if (oldFullPageArchiveAssetId) {
- deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
- () => ({}),
- );
- }
- }
+ await archivalLogic();
}