author     Mohamed Bassem <me@mbassem.com>  2026-02-08 21:07:37 +0000
committer  GitHub <noreply@github.com>      2026-02-08 21:07:37 +0000
commit     c8464e303f6e7fba6b88c7f29c0570c2b49a494d
tree       f9d7a9f94c9ac0030402b9b13f0210b9ca916e08 /apps
parent     08f1a7973294777db5950d6f973590bdceeb1259
download   karakeep-c8464e303f6e7fba6b88c7f29c0570c2b49a494d.tar.zst
feat(crawler): Split bookmark metadata updates into two phases for faster feedback (#2467)
* feat(crawler): write metadata to DB early for faster user feedback
Split the single DB transaction in crawlAndParseUrl into two phases:
- Phase 1: Write metadata (title, description, favicon, author, etc.)
immediately after extraction, before downloading assets
- Phase 2: Write content and asset references after all assets are
stored (banner image, screenshot, pdf, html content)
This gives users near-instant feedback with bookmark metadata while
the slower asset downloads and uploads happen in the background.
https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ
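To make the two-phase flow concrete, here is a minimal sketch of the pattern, assuming the drizzle-style `db.update(...).set(...).where(...)` calls that appear in the diff below. The helper names, import paths, and the trimmed-down column set are illustrative, not part of the actual change:

```ts
import { eq } from "drizzle-orm";

import { db } from "~/db"; // hypothetical import path
import { bookmarkLinks } from "~/db/schema"; // hypothetical import path

// Phase 1 (fast): persist extracted metadata as soon as parsing is done,
// so the UI can show title/description while assets are still downloading.
async function writeMetadataPhase(
  bookmarkId: string,
  meta: { title?: string; description?: string },
) {
  await db
    .update(bookmarkLinks)
    .set({ title: meta.title, description: meta.description })
    .where(eq(bookmarkLinks.id, bookmarkId));
}

// Phase 2 (slow): persist content and asset references only after the
// banner image, screenshot, pdf, and html content are fully stored.
async function writeContentPhase(bookmarkId: string, htmlContent: string) {
  await db
    .update(bookmarkLinks)
    .set({ htmlContent, crawledAt: new Date() })
    .where(eq(bookmarkLinks.id, bookmarkId));
}
```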
* fix(crawler): move crawledAt to phase 2 DB write
crawledAt should only be set once all assets are fully stored, not
during the early metadata write.
https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ
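In other words, a non-null `crawledAt` is the signal that crawling is complete. A hypothetical consumer (not Karakeep's actual API) shows why the phase-1 write must leave it unset:

```ts
// If crawledAt were written in phase 1, this would report "done"
// while the screenshot/pdf uploads were still in flight.
function crawlStatus(row: { crawledAt: Date | null }): "pending" | "done" {
  return row.crawledAt === null ? "pending" : "done";
}
```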
---------
Co-authored-by: Claude <noreply@anthropic.com>
Diffstat (limited to 'apps')
-rw-r--r--  apps/workers/workers/crawlerWorker.ts  54
1 file changed, 32 insertions, 22 deletions
```diff
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 5869354f..9815571e 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1450,6 +1450,36 @@ async function crawlAndParseUrl(
   ]);
   abortSignal.throwIfAborted();
 
+  const parseDate = (date: string | undefined) => {
+    if (!date) {
+      return null;
+    }
+    try {
+      return new Date(date);
+    } catch {
+      return null;
+    }
+  };
+
+  // Phase 1: Write metadata immediately for fast user feedback.
+  // Content and asset storage happen later and can be slow (banner
+  // image download, screenshot/pdf upload, etc.).
+  await db
+    .update(bookmarkLinks)
+    .set({
+      title: meta.title,
+      description: meta.description,
+      // Don't store data URIs as they're not valid URLs and are usually quite large
+      imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
+      favicon: meta.logo,
+      crawlStatusCode: statusCode,
+      author: meta.author,
+      publisher: meta.publisher,
+      datePublished: parseDate(meta.datePublished),
+      dateModified: parseDate(meta.dateModified),
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
   let readableContent: { content: string } | null = meta.readableContentHtml
     ? { content: meta.readableContentHtml }
     : null;
@@ -1504,17 +1534,7 @@
   }
   abortSignal.throwIfAborted();
 
-  const parseDate = (date: string | undefined) => {
-    if (!date) {
-      return null;
-    }
-    try {
-      return new Date(date);
-    } catch {
-      return null;
-    }
-  };
-
+  // Phase 2: Write content and asset references.
   // TODO(important): Restrict the size of content to store
   const assetDeletionTasks: Promise<void>[] = [];
   const inlineHtmlContent =
@@ -1526,22 +1546,12 @@
       await txn
         .update(bookmarkLinks)
         .set({
-          title: meta.title,
-          description: meta.description,
-          // Don't store data URIs as they're not valid URLs and are usually quite large
-          imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
-          favicon: meta.logo,
+          crawledAt: new Date(),
           htmlContent: inlineHtmlContent,
           contentAssetId:
             htmlContentAssetInfo.result === "stored"
               ? htmlContentAssetInfo.assetId
               : null,
-          crawledAt: new Date(),
-          crawlStatusCode: statusCode,
-          author: meta.author,
-          publisher: meta.publisher,
-          datePublished: parseDate(meta.datePublished),
-          dateModified: parseDate(meta.dateModified),
         })
         .where(eq(bookmarkLinks.id, bookmarkId));
```
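As an aside on the `parseDate` helper quoted above: in JavaScript, `new Date("not a date")` returns an Invalid Date rather than throwing, so a `try/catch` by itself will not map unparseable strings to `null`. A common variant of the guard, shown only as a sketch and not as what the commit does:

```ts
const parseDate = (date: string | undefined): Date | null => {
  if (!date) {
    return null;
  }
  const parsed = new Date(date);
  // An unparseable string yields an Invalid Date (NaN timestamp)
  // instead of an exception, so validate the result explicitly.
  return Number.isNaN(parsed.getTime()) ? null : parsed;
};
```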
