author    kamtschatka <simon.schatka@gmx.at>  2024-06-22 18:52:40 +0200
committer GitHub <noreply@github.com>  2024-06-22 17:52:40 +0100
commit    be1b7f7e1c0cb3d905e13aa1a95e295b816cbdeb (patch)
tree      6a0556a1bfd4fee1a10c99f88bd39a316107b631 /apps/workers/crawlerWorker.ts
parent    ccfff6b1954030a273b0612f3772ec00a82422c8 (diff)
download  karakeep-be1b7f7e1c0cb3d905e13aa1a95e295b816cbdeb.tar.zst
feature: add support for PDF links. Fixes #28 (#216)
* feature request: pdf support #28
  - Added a new sourceUrl column to the asset bookmarks
  - Added transforming a link bookmark pointing at a PDF into an asset bookmark
  - Made sure the "View Original" link is also shown for asset bookmarks that have a sourceUrl
  - Updated .gitignore for IDEA
* remove pdf parsing from the crawler
* extract the http logic into its own function to avoid duplicating the post-processing actions (openai/index)
* Add 5s timeout to the content type fetch

---------

Co-authored-by: MohamedBassem <me@mbassem.com>
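The core of the change is a quick content-type probe before crawling: a HEAD request with a 5 second timeout decides whether the link becomes a PDF asset bookmark or goes through the normal webpage crawl. Below is a minimal, self-contained sketch of that probe (illustrative only; the patch's actual helper is getContentType() in the diff, and "application/pdf" is assumed here to be the value of ASSET_TYPES.APPLICATION_PDF). It assumes a runtime with global fetch and AbortSignal.timeout, i.e. Node 18+.

// Illustrative sketch, not the patch's code: probe a URL's content type
// with a 5s timeout and report whether it looks like a PDF.
async function probeIsPdf(url: string): Promise<boolean> {
  try {
    const response = await fetch(url, {
      method: "HEAD",
      signal: AbortSignal.timeout(5000), // give up after 5s, as in getContentType()
    });
    const contentType = response.headers.get("content-type") ?? "";
    // The patch compares against ASSET_TYPES.APPLICATION_PDF; the string
    // literal here is an assumption about that constant's value.
    return contentType.startsWith("application/pdf");
  } catch {
    // Timeouts and network errors fall back to treating the link as a webpage.
    return false;
  }
}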
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
-rw-r--r--  apps/workers/crawlerWorker.ts  220
1 file changed, 163 insertions, 57 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 58f1aa85..eedb7b1e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,5 +1,6 @@
import assert from "assert";
import * as dns from "dns";
+import * as path from "node:path";
import type { Job } from "bullmq";
import type { Browser } from "puppeteer";
import { Readability } from "@mozilla/readability";
@@ -26,8 +27,9 @@ import { withTimeout } from "utils";
import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
-import { bookmarkLinks, bookmarks } from "@hoarder/db/schema";
+import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
import {
+ ASSET_TYPES,
deleteAsset,
newAssetId,
saveAsset,
@@ -68,7 +70,7 @@ async function startBrowserInstance() {
logger.info(
`[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
);
- return await puppeteer.connect({
+ return puppeteer.connect({
browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
defaultViewport,
});
@@ -83,13 +85,13 @@ async function startBrowserInstance() {
logger.info(
`[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
);
- return await puppeteer.connect({
+ return puppeteer.connect({
browserURL: webUrl.toString(),
defaultViewport,
});
} else {
logger.info(`Launching a new browser instance`);
- return await puppeteer.launch({
+ return puppeteer.launch({
headless: serverConfig.crawler.headlessBrowser,
defaultViewport,
});
@@ -271,7 +273,11 @@ async function crawlPage(jobId: string, url: string) {
logger.info(
`[Crawler][${jobId}] Finished capturing page content and a screenshot. FullPageScreenshot: ${serverConfig.crawler.fullPageScreenshot}`,
);
- return { htmlContent, screenshot, url: page.url() };
+ return {
+ htmlContent,
+ screenshot,
+ url: page.url(),
+ };
} finally {
await context.close();
}
@@ -337,22 +343,17 @@ async function storeScreenshot(
return assetId;
}
-async function downloadAndStoreImage(
+async function downloadAndStoreFile(
url: string,
userId: string,
jobId: string,
+ fileType: string,
) {
- if (!serverConfig.crawler.downloadBannerImage) {
- logger.info(
- `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
- );
- return null;
- }
try {
- logger.info(`[Crawler][${jobId}] Downloading image from "${url}"`);
+ logger.info(`[Crawler][${jobId}] Downloading ${fileType} from "${url}"`);
const response = await fetch(url);
if (!response.ok) {
- throw new Error(`Failed to download image: ${response.status}`);
+ throw new Error(`Failed to download ${fileType}: ${response.status}`);
}
const buffer = await response.arrayBuffer();
const assetId = newAssetId();
@@ -370,18 +371,32 @@ async function downloadAndStoreImage(
});
logger.info(
- `[Crawler][${jobId}] Downloaded the image as assetId: ${assetId}`,
+ `[Crawler][${jobId}] Downloaded ${fileType} as assetId: ${assetId}`,
);
return assetId;
} catch (e) {
logger.error(
- `[Crawler][${jobId}] Failed to download and store image: ${e}`,
+ `[Crawler][${jobId}] Failed to download and store ${fileType}: ${e}`,
);
return null;
}
}
+async function downloadAndStoreImage(
+ url: string,
+ userId: string,
+ jobId: string,
+) {
+ if (!serverConfig.crawler.downloadBannerImage) {
+ logger.info(
+ `[Crawler][${jobId}] Skipping downloading the image as per the config.`,
+ );
+ return null;
+ }
+ return downloadAndStoreFile(url, userId, jobId, "image");
+}
+
async function archiveWebpage(
html: string,
url: string,
@@ -415,31 +430,70 @@ async function archiveWebpage(
return assetId;
}
-async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
- const jobId = job.id ?? "unknown";
-
- const request = zCrawlLinkRequestSchema.safeParse(job.data);
- if (!request.success) {
+async function getContentType(
+ url: string,
+ jobId: string,
+): Promise<string | null> {
+ try {
+ logger.info(
+ `[Crawler][${jobId}] Attempting to determine the content-type for the url ${url}`,
+ );
+ const response = await fetch(url, {
+ method: "HEAD",
+ signal: AbortSignal.timeout(5000),
+ });
+ const contentType = response.headers.get("content-type");
+ logger.info(
+ `[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
+ );
+ return contentType;
+ } catch (e) {
logger.error(
- `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+ `[Crawler][${jobId}] Failed to determine the content-type for the url ${url}: ${e}`,
);
- return;
+ return null;
}
+}
- const { bookmarkId } = request.data;
- const {
- url,
- userId,
- screenshotAssetId: oldScreenshotAssetId,
- imageAssetId: oldImageAssetId,
- fullPageArchiveAssetId: oldFullPageArchiveAssetId,
- } = await getBookmarkDetails(bookmarkId);
-
- logger.info(
- `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
- );
- validateUrl(url);
+/**
+ * Downloads the pdf asset from the URL and transforms the linkBookmark to an assetBookmark
+ * @param url the url the user provided
+ * @param userId the id of the user
+ * @param jobId the id of the job for logging
+ * @param bookmarkId the id of the bookmark
+ */
+async function handlePDFAsAssetBookmark(
+ url: string,
+ userId: string,
+ jobId: string,
+ bookmarkId: string,
+) {
+ const assetId = await downloadAndStoreFile(url, userId, jobId, "pdf");
+ if (!assetId) {
+ return;
+ }
+ await db.transaction(async (trx) => {
+ await trx.insert(bookmarkAssets).values({
+ id: bookmarkId,
+ assetType: "pdf",
+ assetId,
+ content: null,
+ fileName: path.basename(new URL(url).pathname),
+ sourceUrl: url,
+ });
+ await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId));
+ });
+}
+async function crawlAndParseUrl(
+ url: string,
+ userId: string,
+ jobId: string,
+ bookmarkId: string,
+ oldScreenshotAssetId: string | null,
+ oldImageAssetId: string | null,
+ oldFullPageArchiveAssetId: string | null,
+) {
const {
htmlContent,
screenshot,
@@ -482,6 +536,78 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
: {},
]);
+ return async () => {
+ if (serverConfig.crawler.fullPageArchive) {
+ const fullPageArchiveAssetId = await archiveWebpage(
+ htmlContent,
+ browserUrl,
+ userId,
+ jobId,
+ );
+
+ await db
+ .update(bookmarkLinks)
+ .set({
+ fullPageArchiveAssetId,
+ })
+ .where(eq(bookmarkLinks.id, bookmarkId));
+
+ if (oldFullPageArchiveAssetId) {
+ deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
+ () => ({}),
+ );
+ }
+ }
+ };
+}
+
+async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
+ const jobId = job.id ?? "unknown";
+
+ const request = zCrawlLinkRequestSchema.safeParse(job.data);
+ if (!request.success) {
+ logger.error(
+ `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+ );
+ return;
+ }
+
+ const { bookmarkId } = request.data;
+ const {
+ url,
+ userId,
+ screenshotAssetId: oldScreenshotAssetId,
+ imageAssetId: oldImageAssetId,
+ fullPageArchiveAssetId: oldFullPageArchiveAssetId,
+ } = await getBookmarkDetails(bookmarkId);
+
+ logger.info(
+ `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
+ );
+ validateUrl(url);
+
+ const contentType = await getContentType(url, jobId);
+
+ // Link bookmarks get transformed into asset bookmarks if they point to a pdf asset instead of a webpage
+ const isPdf = contentType === ASSET_TYPES.APPLICATION_PDF;
+
+ let archivalLogic: () => Promise<void> = () => {
+ return Promise.resolve();
+ };
+ if (isPdf) {
+ await handlePDFAsAssetBookmark(url, userId, jobId, bookmarkId);
+ } else {
+ archivalLogic = await crawlAndParseUrl(
+ url,
+ userId,
+ jobId,
+ bookmarkId,
+ oldScreenshotAssetId,
+ oldImageAssetId,
+ oldFullPageArchiveAssetId,
+ );
+ }
+
// Enqueue openai job (if not set, assume it's true for backward compatibility)
if (job.data.runInference !== false) {
OpenAIQueue.add("openai", {
@@ -493,25 +619,5 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
triggerSearchReindex(bookmarkId);
// Do the archival as a separate last step as it has the potential for failure
- if (serverConfig.crawler.fullPageArchive) {
- const fullPageArchiveAssetId = await archiveWebpage(
- htmlContent,
- browserUrl,
- userId,
- jobId,
- );
-
- await db
- .update(bookmarkLinks)
- .set({
- fullPageArchiveAssetId,
- })
- .where(eq(bookmarkLinks.id, bookmarkId));
-
- if (oldFullPageArchiveAssetId) {
- deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
- () => ({}),
- );
- }
- }
+ await archivalLogic();
}
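Design note: crawlAndParseUrl() returns the full-page archival step as a closure instead of running it inline, so the inference job and search reindex are enqueued before the step most likely to fail. A generic, illustrative sketch of that pattern (not code from the patch):

// Illustrative sketch of the "defer the risky step" pattern used above.
async function prepareWork(): Promise<() => Promise<void>> {
  // main crawl/parse work would happen here
  return async () => {
    // the failure-prone step (e.g. full-page archival) runs last
  };
}

async function runSketch() {
  const deferredArchival = await prepareWork();
  // enqueue inference, trigger search reindex, etc.
  await deferredArchival(); // a failure here no longer blocks the steps above
}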