diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-07-06 21:50:23 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-07-06 22:04:56 +0000 |
| commit | dee3a4d44ddb1999e7dec383889246e87f202d92 (patch) | |
| tree | 1984234f17eed886bc834543e1505ddbfb43228f /apps | |
| parent | 362be3008aa8b036c4c448a86e459044af8784c2 (diff) | |
| download | karakeep-dee3a4d44ddb1999e7dec383889246e87f202d92.tar.zst | |
feat: Store large html content in the asset db
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/web/lib/attachments.tsx | 10 | ||||
| -rw-r--r-- | apps/workers/workerUtils.ts | 3 | ||||
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 109 | ||||
| -rw-r--r-- | apps/workers/workers/inference/summarize.ts | 12 | ||||
| -rw-r--r-- | apps/workers/workers/inference/tagging.ts | 11 | ||||
| -rw-r--r-- | apps/workers/workers/searchWorker.ts | 9 |
6 files changed, 144 insertions, 10 deletions
diff --git a/apps/web/lib/attachments.tsx b/apps/web/lib/attachments.tsx index 62848d33..ce34b295 100644 --- a/apps/web/lib/attachments.tsx +++ b/apps/web/lib/attachments.tsx @@ -1,4 +1,11 @@ -import { Archive, Camera, Image, Paperclip, Video } from "lucide-react"; +import { + Archive, + Camera, + FileCode, + Image, + Paperclip, + Video, +} from "lucide-react"; import { ZAssetType } from "@karakeep/shared/types/bookmarks"; @@ -10,5 +17,6 @@ export const ASSET_TYPE_TO_ICON: Record<ZAssetType, React.ReactNode> = { bannerImage: <Image className="size-4" />, video: <Video className="size-4" />, bookmarkAsset: <Paperclip className="size-4" />, + linkHtmlContent: <FileCode className="size-4" />, unknown: <Paperclip className="size-4" />, }; diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts index d41df578..3eaf5b4b 100644 --- a/apps/workers/workerUtils.ts +++ b/apps/workers/workerUtils.ts @@ -46,5 +46,8 @@ export async function getBookmarkDetails(bookmarkId: string) { precrawledArchiveAssetId: bookmark.assets
.filter((a) => a.assetType == AssetTypes.LINK_PRECRAWLED_ARCHIVE)
.at(-1)?.id,
+ contentAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_HTML_CONTENT,
+ )?.id,
};
}
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index 10106a3b..428ec0f5 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -696,6 +696,75 @@ async function handleAsAssetBookmark( }); } +const HTML_CONTENT_SIZE_THRESHOLD = 50 * 1024; // 50KB + +type StoreHtmlResult = + | { result: "stored"; assetId: string; size: number } + | { result: "store_inline" } + | { result: "not_stored" }; + +async function storeHtmlContent( + htmlContent: string | undefined, + userId: string, + jobId: string, +): Promise<StoreHtmlResult> { + if (!htmlContent) { + return { result: "not_stored" }; + } + + const contentBuffer = Buffer.from(htmlContent, "utf8"); + const contentSize = contentBuffer.byteLength; + + // Only store in assets if content is >= 50KB + if (contentSize < HTML_CONTENT_SIZE_THRESHOLD) { + logger.info( + `[Crawler][${jobId}] HTML content size (${contentSize} bytes) is below threshold, storing inline`, + ); + return { result: "store_inline" }; + } + + try { + const quotaApproved = await checkStorageQuota( + db, + userId, + contentBuffer.byteLength, + ); + const assetId = newAssetId(); + + await saveAsset({ + userId, + assetId, + asset: contentBuffer, + metadata: { + contentType: ASSET_TYPES.TEXT_HTML, + fileName: null, + }, + quotaApproved, + }); + + logger.info( + `[Crawler][${jobId}] Stored large HTML content (${contentSize} bytes) as asset: ${assetId}`, + ); + + return { + result: "stored", + assetId, + size: contentSize, + }; + } catch (error) { + if (error instanceof StorageQuotaError) { + logger.warn( + `[Crawler][${jobId}] Skipping HTML content storage due to quota exceeded: ${error.message}`, + ); + return { result: "not_stored" }; + } + logger.error( + `[Crawler][${jobId}] Failed to store HTML content as asset: ${error}`, + ); + throw error; + } +} + async function crawlAndParseUrl( url: string, userId: string, @@ -704,6 +773,7 @@ async function crawlAndParseUrl( oldScreenshotAssetId: string | undefined, oldImageAssetId: string | undefined, oldFullPageArchiveAssetId: string | undefined, + oldContentAssetId: string | undefined, precrawledArchiveAssetId: string | undefined, archiveFullPage: boolean, abortSignal: AbortSignal, @@ -741,6 +811,12 @@ async function crawlAndParseUrl( extractReadableContent(htmlContent, browserUrl, jobId), storeScreenshot(screenshot, userId, jobId), ]); + + const htmlContentAssetInfo = await storeHtmlContent( + readableContent?.content, + userId, + jobId, + ); abortSignal.throwIfAborted(); let imageAssetInfo: DBAssetType | null = null; if (meta.image) { @@ -784,8 +860,14 @@ async function crawlAndParseUrl( // Don't store data URIs as they're not valid URLs and are usually quite large imageUrl: meta.image?.startsWith("data:") ? null : meta.image, favicon: meta.logo, - content: readableContent?.textContent, - htmlContent: readableContent?.content, + htmlContent: + htmlContentAssetInfo.result === "store_inline" + ? readableContent?.content + : null, + contentAssetId: + htmlContentAssetInfo.result === "stored" + ? htmlContentAssetInfo.assetId + : null, crawledAt: new Date(), crawlStatusCode: statusCode, author: meta.author, @@ -813,12 +895,31 @@ async function crawlAndParseUrl( if (imageAssetInfo) { await updateAsset(oldImageAssetId, imageAssetInfo, txn); } + if (htmlContentAssetInfo.result === "stored") { + await updateAsset( + oldContentAssetId, + { + id: htmlContentAssetInfo.assetId, + bookmarkId, + userId, + assetType: AssetTypes.LINK_HTML_CONTENT, + contentType: ASSET_TYPES.TEXT_HTML, + size: htmlContentAssetInfo.size, + fileName: null, + }, + txn, + ); + } else if (oldContentAssetId) { + // Unlink the old content asset + await txn.delete(assets).where(eq(assets.id, oldContentAssetId)); + } }); // Delete the old assets if any await Promise.all([ silentDeleteAsset(userId, oldScreenshotAssetId), silentDeleteAsset(userId, oldImageAssetId), + silentDeleteAsset(userId, oldContentAssetId), ]); return async () => { @@ -857,7 +958,7 @@ async function crawlAndParseUrl( ); }); if (oldFullPageArchiveAssetId) { - silentDeleteAsset(userId, oldFullPageArchiveAssetId); + await silentDeleteAsset(userId, oldFullPageArchiveAssetId); } } } @@ -882,6 +983,7 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) { screenshotAssetId: oldScreenshotAssetId, imageAssetId: oldImageAssetId, fullPageArchiveAssetId: oldFullPageArchiveAssetId, + contentAssetId: oldContentAssetId, precrawledArchiveAssetId, } = await getBookmarkDetails(bookmarkId); @@ -926,6 +1028,7 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) { oldScreenshotAssetId, oldImageAssetId, oldFullPageArchiveAssetId, + oldContentAssetId, precrawledArchiveAssetId, archiveFullPage, job.abortSignal, diff --git a/apps/workers/workers/inference/summarize.ts b/apps/workers/workers/inference/summarize.ts index a832fe0a..2d9c8ef7 100644 --- a/apps/workers/workers/inference/summarize.ts +++ b/apps/workers/workers/inference/summarize.ts @@ -9,6 +9,7 @@ import logger from "@karakeep/shared/logger"; import { buildSummaryPrompt } from "@karakeep/shared/prompts"; import { triggerSearchReindex, ZOpenAIRequest } from "@karakeep/shared/queues"; import { BookmarkTypes } from "@karakeep/shared/types/bookmarks"; +import { Bookmark } from "@karakeep/trpc/models/bookmarks"; async function fetchBookmarkDetailsForSummary(bookmarkId: string) { const bookmark = await db.query.bookmarks.findFirst({ @@ -19,7 +20,8 @@ async function fetchBookmarkDetailsForSummary(bookmarkId: string) { columns: { title: true, description: true, - content: true, + htmlContent: true, + contentAssetId: true, publisher: true, author: true, url: true, @@ -57,10 +59,16 @@ export async function runSummarization( let textToSummarize = ""; if (bookmarkData.type === BookmarkTypes.LINK && bookmarkData.link) { const link = bookmarkData.link; + + // Extract plain text content from HTML for summarization + let content = + (await Bookmark.getBookmarkPlainTextContent(link, bookmarkData.userId)) ?? + ""; + textToSummarize = ` Title: ${link.title ?? ""} Description: ${link.description ?? ""} -Content: ${link.content ?? ""} +Content: ${content} Publisher: ${link.publisher ?? ""} Author: ${link.author ?? ""} URL: ${link.url ?? ""} diff --git a/apps/workers/workers/inference/tagging.ts b/apps/workers/workers/inference/tagging.ts index d666b3d9..271eea4b 100644 --- a/apps/workers/workers/inference/tagging.ts +++ b/apps/workers/workers/inference/tagging.ts @@ -21,6 +21,7 @@ import { triggerSearchReindex, triggerWebhook, } from "@karakeep/shared/queues"; +import { Bookmark } from "@karakeep/trpc/models/bookmarks"; const openAIResponseSchema = z.object({ tags: z.array(z.string()), @@ -77,13 +78,17 @@ async function buildPrompt( ) { const prompts = await fetchCustomPrompts(bookmark.userId, "text"); if (bookmark.link) { - if (!bookmark.link.description && !bookmark.link.content) { + let content = + (await Bookmark.getBookmarkPlainTextContent( + bookmark.link, + bookmark.userId, + )) ?? ""; + + if (!bookmark.link.description && !content) { throw new Error( `No content found for link "${bookmark.id}". Skipping ...`, ); } - - const content = bookmark.link.content; return buildTextPrompt( serverConfig.inference.inferredTagLang, prompts, diff --git a/apps/workers/workers/searchWorker.ts b/apps/workers/workers/searchWorker.ts index 13243152..177ca0db 100644 --- a/apps/workers/workers/searchWorker.ts +++ b/apps/workers/workers/searchWorker.ts @@ -11,6 +11,7 @@ import { zSearchIndexingRequestSchema, } from "@karakeep/shared/queues"; import { getSearchIdxClient } from "@karakeep/shared/search"; +import { Bookmark } from "@karakeep/trpc/models/bookmarks"; export class SearchIndexingWorker { static build() { @@ -75,6 +76,12 @@ async function runIndex( throw new Error(`Bookmark ${bookmarkId} not found`); } + // Extract plain text content from HTML for search indexing + const content = await Bookmark.getBookmarkPlainTextContent( + bookmark.link, + bookmark.userId, + ); + const task = await searchClient.addDocuments( [ { @@ -85,7 +92,7 @@ async function runIndex( url: bookmark.link.url, linkTitle: bookmark.link.title, description: bookmark.link.description, - content: bookmark.link.content, + content, publisher: bookmark.link.publisher, author: bookmark.link.author, datePublished: bookmark.link.datePublished, |
