aboutsummaryrefslogtreecommitdiffstats
path: root/apps
diff options
context:
space:
mode:
author: Mohamed Bassem <me@mbassem.com> 2025-07-06 21:50:23 +0000
committer: Mohamed Bassem <me@mbassem.com> 2025-07-06 22:04:56 +0000
commit: dee3a4d44ddb1999e7dec383889246e87f202d92 (patch)
tree: 1984234f17eed886bc834543e1505ddbfb43228f /apps
parent: 362be3008aa8b036c4c448a86e459044af8784c2 (diff)
download: karakeep-dee3a4d44ddb1999e7dec383889246e87f202d92.tar.zst
feat: Store large html content in the asset db
Diffstat (limited to 'apps')
-rw-r--r-- apps/web/lib/attachments.tsx 10
-rw-r--r-- apps/workers/workerUtils.ts 3
-rw-r--r-- apps/workers/workers/crawlerWorker.ts 109
-rw-r--r-- apps/workers/workers/inference/summarize.ts 12
-rw-r--r-- apps/workers/workers/inference/tagging.ts 11
-rw-r--r-- apps/workers/workers/searchWorker.ts 9
6 files changed, 144 insertions, 10 deletions
diff --git a/apps/web/lib/attachments.tsx b/apps/web/lib/attachments.tsx
index 62848d33..ce34b295 100644
--- a/apps/web/lib/attachments.tsx
+++ b/apps/web/lib/attachments.tsx
@@ -1,4 +1,11 @@
-import { Archive, Camera, Image, Paperclip, Video } from "lucide-react";
+import {
+ Archive,
+ Camera,
+ FileCode,
+ Image,
+ Paperclip,
+ Video,
+} from "lucide-react";
import { ZAssetType } from "@karakeep/shared/types/bookmarks";
@@ -10,5 +17,6 @@ export const ASSET_TYPE_TO_ICON: Record<ZAssetType, React.ReactNode> = {
bannerImage: <Image className="size-4" />,
video: <Video className="size-4" />,
bookmarkAsset: <Paperclip className="size-4" />,
+ linkHtmlContent: <FileCode className="size-4" />,
unknown: <Paperclip className="size-4" />,
};
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
index d41df578..3eaf5b4b 100644
--- a/apps/workers/workerUtils.ts
+++ b/apps/workers/workerUtils.ts
@@ -46,5 +46,8 @@ export async function getBookmarkDetails(bookmarkId: string) {
precrawledArchiveAssetId: bookmark.assets
.filter((a) => a.assetType == AssetTypes.LINK_PRECRAWLED_ARCHIVE)
.at(-1)?.id,
+ contentAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_HTML_CONTENT,
+ )?.id,
};
}
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 10106a3b..428ec0f5 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -696,6 +696,75 @@ async function handleAsAssetBookmark(
});
}
+const HTML_CONTENT_SIZE_THRESHOLD = 50 * 1024; // 50KB; HTML at or above this many bytes is saved as an asset instead of inline
+ // Outcome of storeHtmlContent, discriminated by the `result` field.
+type StoreHtmlResult =
+ | { result: "stored"; assetId: string; size: number } // saved as an asset; `size` is the content's byte length
+ | { result: "store_inline" } // below the threshold; nothing was saved as an asset
+ | { result: "not_stored" }; // no content given, or skipped because the storage quota was exceeded
+
+async function storeHtmlContent( // Saves HTML as a separate asset when it is large enough; otherwise reports that it should be kept inline.
+ htmlContent: string | undefined, // HTML to persist; undefined or empty returns "not_stored"
+ userId: string, // passed to the quota check and to saveAsset
+ jobId: string, // only used to tag log messages
+): Promise<StoreHtmlResult> { // resolves to "stored" | "store_inline" | "not_stored"; non-quota failures are rethrown
+ if (!htmlContent) {
+ return { result: "not_stored" };
+ }
+ // The threshold is compared against the UTF-8 encoded byte length, not the string length.
+ const contentBuffer = Buffer.from(htmlContent, "utf8");
+ const contentSize = contentBuffer.byteLength;
+
+ // Only store in assets if content is >= 50KB
+ if (contentSize < HTML_CONTENT_SIZE_THRESHOLD) {
+ logger.info(
+ `[Crawler][${jobId}] HTML content size (${contentSize} bytes) is below threshold, storing inline`,
+ );
+ return { result: "store_inline" };
+ }
+ // Quota check and save share one try block, so a StorageQuotaError thrown anywhere in it is handled by the catch below.
+ try {
+ const quotaApproved = await checkStorageQuota(
+ db,
+ userId,
+ contentBuffer.byteLength,
+ );
+ const assetId = newAssetId();
+
+ await saveAsset({
+ userId,
+ assetId,
+ asset: contentBuffer,
+ metadata: {
+ contentType: ASSET_TYPES.TEXT_HTML,
+ fileName: null,
+ },
+ quotaApproved, // hands the result of the quota check above to saveAsset
+ });
+
+ logger.info(
+ `[Crawler][${jobId}] Stored large HTML content (${contentSize} bytes) as asset: ${assetId}`,
+ );
+
+ return {
+ result: "stored",
+ assetId,
+ size: contentSize,
+ };
+ } catch (error) {
+ if (error instanceof StorageQuotaError) { // quota exceeded: log a warning and return "not_stored" instead of throwing
+ logger.warn(
+ `[Crawler][${jobId}] Skipping HTML content storage due to quota exceeded: ${error.message}`,
+ );
+ return { result: "not_stored" };
+ }
+ logger.error(
+ `[Crawler][${jobId}] Failed to store HTML content as asset: ${error}`,
+ );
+ throw error; // any other failure propagates to the caller
+ }
+}
+
async function crawlAndParseUrl(
url: string,
userId: string,
@@ -704,6 +773,7 @@ async function crawlAndParseUrl(
oldScreenshotAssetId: string | undefined,
oldImageAssetId: string | undefined,
oldFullPageArchiveAssetId: string | undefined,
+ oldContentAssetId: string | undefined,
precrawledArchiveAssetId: string | undefined,
archiveFullPage: boolean,
abortSignal: AbortSignal,
@@ -741,6 +811,12 @@ async function crawlAndParseUrl(
extractReadableContent(htmlContent, browserUrl, jobId),
storeScreenshot(screenshot, userId, jobId),
]);
+
+ const htmlContentAssetInfo = await storeHtmlContent(
+ readableContent?.content,
+ userId,
+ jobId,
+ );
abortSignal.throwIfAborted();
let imageAssetInfo: DBAssetType | null = null;
if (meta.image) {
@@ -784,8 +860,14 @@ async function crawlAndParseUrl(
// Don't store data URIs as they're not valid URLs and are usually quite large
imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
favicon: meta.logo,
- content: readableContent?.textContent,
- htmlContent: readableContent?.content,
+ htmlContent:
+ htmlContentAssetInfo.result === "store_inline"
+ ? readableContent?.content
+ : null,
+ contentAssetId:
+ htmlContentAssetInfo.result === "stored"
+ ? htmlContentAssetInfo.assetId
+ : null,
crawledAt: new Date(),
crawlStatusCode: statusCode,
author: meta.author,
@@ -813,12 +895,31 @@ async function crawlAndParseUrl(
if (imageAssetInfo) {
await updateAsset(oldImageAssetId, imageAssetInfo, txn);
}
+ if (htmlContentAssetInfo.result === "stored") {
+ await updateAsset(
+ oldContentAssetId,
+ {
+ id: htmlContentAssetInfo.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_HTML_CONTENT,
+ contentType: ASSET_TYPES.TEXT_HTML,
+ size: htmlContentAssetInfo.size,
+ fileName: null,
+ },
+ txn,
+ );
+ } else if (oldContentAssetId) {
+ // Unlink the old content asset
+ await txn.delete(assets).where(eq(assets.id, oldContentAssetId));
+ }
});
// Delete the old assets if any
await Promise.all([
silentDeleteAsset(userId, oldScreenshotAssetId),
silentDeleteAsset(userId, oldImageAssetId),
+ silentDeleteAsset(userId, oldContentAssetId),
]);
return async () => {
@@ -857,7 +958,7 @@ async function crawlAndParseUrl(
);
});
if (oldFullPageArchiveAssetId) {
- silentDeleteAsset(userId, oldFullPageArchiveAssetId);
+ await silentDeleteAsset(userId, oldFullPageArchiveAssetId);
}
}
}
@@ -882,6 +983,7 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
screenshotAssetId: oldScreenshotAssetId,
imageAssetId: oldImageAssetId,
fullPageArchiveAssetId: oldFullPageArchiveAssetId,
+ contentAssetId: oldContentAssetId,
precrawledArchiveAssetId,
} = await getBookmarkDetails(bookmarkId);
@@ -926,6 +1028,7 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
oldScreenshotAssetId,
oldImageAssetId,
oldFullPageArchiveAssetId,
+ oldContentAssetId,
precrawledArchiveAssetId,
archiveFullPage,
job.abortSignal,
diff --git a/apps/workers/workers/inference/summarize.ts b/apps/workers/workers/inference/summarize.ts
index a832fe0a..2d9c8ef7 100644
--- a/apps/workers/workers/inference/summarize.ts
+++ b/apps/workers/workers/inference/summarize.ts
@@ -9,6 +9,7 @@ import logger from "@karakeep/shared/logger";
import { buildSummaryPrompt } from "@karakeep/shared/prompts";
import { triggerSearchReindex, ZOpenAIRequest } from "@karakeep/shared/queues";
import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
+import { Bookmark } from "@karakeep/trpc/models/bookmarks";
async function fetchBookmarkDetailsForSummary(bookmarkId: string) {
const bookmark = await db.query.bookmarks.findFirst({
@@ -19,7 +20,8 @@ async function fetchBookmarkDetailsForSummary(bookmarkId: string) {
columns: {
title: true,
description: true,
- content: true,
+ htmlContent: true,
+ contentAssetId: true,
publisher: true,
author: true,
url: true,
@@ -57,10 +59,16 @@ export async function runSummarization(
let textToSummarize = "";
if (bookmarkData.type === BookmarkTypes.LINK && bookmarkData.link) {
const link = bookmarkData.link;
+
+ // Extract plain text content from HTML for summarization
+ let content =
+ (await Bookmark.getBookmarkPlainTextContent(link, bookmarkData.userId)) ??
+ "";
+
textToSummarize = `
Title: ${link.title ?? ""}
Description: ${link.description ?? ""}
-Content: ${link.content ?? ""}
+Content: ${content}
Publisher: ${link.publisher ?? ""}
Author: ${link.author ?? ""}
URL: ${link.url ?? ""}
diff --git a/apps/workers/workers/inference/tagging.ts b/apps/workers/workers/inference/tagging.ts
index d666b3d9..271eea4b 100644
--- a/apps/workers/workers/inference/tagging.ts
+++ b/apps/workers/workers/inference/tagging.ts
@@ -21,6 +21,7 @@ import {
triggerSearchReindex,
triggerWebhook,
} from "@karakeep/shared/queues";
+import { Bookmark } from "@karakeep/trpc/models/bookmarks";
const openAIResponseSchema = z.object({
tags: z.array(z.string()),
@@ -77,13 +78,17 @@ async function buildPrompt(
) {
const prompts = await fetchCustomPrompts(bookmark.userId, "text");
if (bookmark.link) {
- if (!bookmark.link.description && !bookmark.link.content) {
+ let content =
+ (await Bookmark.getBookmarkPlainTextContent(
+ bookmark.link,
+ bookmark.userId,
+ )) ?? "";
+
+ if (!bookmark.link.description && !content) {
throw new Error(
`No content found for link "${bookmark.id}". Skipping ...`,
);
}
-
- const content = bookmark.link.content;
return buildTextPrompt(
serverConfig.inference.inferredTagLang,
prompts,
diff --git a/apps/workers/workers/searchWorker.ts b/apps/workers/workers/searchWorker.ts
index 13243152..177ca0db 100644
--- a/apps/workers/workers/searchWorker.ts
+++ b/apps/workers/workers/searchWorker.ts
@@ -11,6 +11,7 @@ import {
zSearchIndexingRequestSchema,
} from "@karakeep/shared/queues";
import { getSearchIdxClient } from "@karakeep/shared/search";
+import { Bookmark } from "@karakeep/trpc/models/bookmarks";
export class SearchIndexingWorker {
static build() {
@@ -75,6 +76,12 @@ async function runIndex(
throw new Error(`Bookmark ${bookmarkId} not found`);
}
+ // Extract plain text content from HTML for search indexing
+ const content = await Bookmark.getBookmarkPlainTextContent(
+ bookmark.link,
+ bookmark.userId,
+ );
+
const task = await searchClient.addDocuments(
[
{
@@ -85,7 +92,7 @@ async function runIndex(
url: bookmark.link.url,
linkTitle: bookmark.link.title,
description: bookmark.link.description,
- content: bookmark.link.content,
+ content,
publisher: bookmark.link.publisher,
author: bookmark.link.author,
datePublished: bookmark.link.datePublished,