From c8464e303f6e7fba6b88c7f29c0570c2b49a494d Mon Sep 17 00:00:00 2001
From: Mohamed Bassem
Date: Sun, 8 Feb 2026 21:07:37 +0000
Subject: feat(crawler): Split bookmark metadata updates into two phases for faster feedback (#2467)

* feat(crawler): write metadata to DB early for faster user feedback

Split the single DB transaction in crawlAndParseUrl into two phases:

- Phase 1: Write metadata (title, description, favicon, author, etc.)
  immediately after extraction, before downloading assets
- Phase 2: Write content and asset references after all assets are stored
  (banner image, screenshot, pdf, html content)

This gives users near-instant feedback with bookmark metadata while the
slower asset downloads and uploads happen in the background.

https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ

* fix(crawler): move crawledAt to phase 2 DB write

crawledAt should only be set once all assets are fully stored, not during
the early metadata write.

https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ

---------

Co-authored-by: Claude
---
 apps/workers/workers/crawlerWorker.ts | 54 +++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 22 deletions(-)

(limited to 'apps/workers')

diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 5869354f..9815571e 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1450,6 +1450,36 @@ async function crawlAndParseUrl(
   ]);
   abortSignal.throwIfAborted();
 
+  const parseDate = (date: string | undefined) => {
+    if (!date) {
+      return null;
+    }
+    try {
+      return new Date(date);
+    } catch {
+      return null;
+    }
+  };
+
+  // Phase 1: Write metadata immediately for fast user feedback.
+  // Content and asset storage happen later and can be slow (banner
+  // image download, screenshot/pdf upload, etc.).
+  await db
+    .update(bookmarkLinks)
+    .set({
+      title: meta.title,
+      description: meta.description,
+      // Don't store data URIs as they're not valid URLs and are usually quite large
+      imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
+      favicon: meta.logo,
+      crawlStatusCode: statusCode,
+      author: meta.author,
+      publisher: meta.publisher,
+      datePublished: parseDate(meta.datePublished),
+      dateModified: parseDate(meta.dateModified),
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
   let readableContent: { content: string } | null = meta.readableContentHtml
     ? { content: meta.readableContentHtml }
     : null;
@@ -1504,17 +1534,7 @@
   }
   abortSignal.throwIfAborted();
 
-  const parseDate = (date: string | undefined) => {
-    if (!date) {
-      return null;
-    }
-    try {
-      return new Date(date);
-    } catch {
-      return null;
-    }
-  };
-
+  // Phase 2: Write content and asset references.
   // TODO(important): Restrict the size of content to store
   const assetDeletionTasks: Promise<void>[] = [];
   const inlineHtmlContent =
@@ -1526,22 +1546,12 @@
     await txn
       .update(bookmarkLinks)
       .set({
-        title: meta.title,
-        description: meta.description,
-        // Don't store data URIs as they're not valid URLs and are usually quite large
-        imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
-        favicon: meta.logo,
+        crawledAt: new Date(),
         htmlContent: inlineHtmlContent,
         contentAssetId:
           htmlContentAssetInfo.result === "stored"
            ? htmlContentAssetInfo.assetId
            : null,
-        crawledAt: new Date(),
-        crawlStatusCode: statusCode,
-        author: meta.author,
-        publisher: meta.publisher,
-        datePublished: parseDate(meta.datePublished),
-        dateModified: parseDate(meta.dateModified),
       })
       .where(eq(bookmarkLinks.id, bookmarkId));
 
--
cgit v1.2.3-70-g09d2
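
As a reading aid, the two-phase update this patch introduces can be reduced to a standalone sketch. This is illustrative only: db, bookmarkLinks, and eq mirror the identifiers in the diff (drizzle-orm style), while Meta, storeAssets, and the import paths are hypothetical stand-ins rather than parts of the actual codebase.

// Minimal sketch of the two-phase write, assuming a drizzle-orm setup.
// Hypothetical stand-ins: "./db", "./schema", Meta, storeAssets.
import { eq } from "drizzle-orm";

import { db } from "./db"; // hypothetical DB handle
import { bookmarkLinks } from "./schema"; // hypothetical table definition

interface Meta {
  title?: string;
  description?: string;
}

// Hypothetical slow step standing in for banner image download,
// screenshot/pdf upload, and HTML content storage.
declare function storeAssets(
  bookmarkId: string,
): Promise<{ contentAssetId: string | null }>;

export async function crawlBookmark(bookmarkId: string, meta: Meta) {
  // Phase 1: cheap metadata write, giving near-instant user feedback.
  await db
    .update(bookmarkLinks)
    .set({ title: meta.title, description: meta.description })
    .where(eq(bookmarkLinks.id, bookmarkId));

  // The slow asset work happens between the two writes.
  const { contentAssetId } = await storeAssets(bookmarkId);

  // Phase 2: asset references, with crawledAt set only once assets exist.
  await db
    .update(bookmarkLinks)
    .set({ crawledAt: new Date(), contentAssetId })
    .where(eq(bookmarkLinks.id, bookmarkId));
}

Splitting the writes trades one transaction for two, so a crash between phases leaves a bookmark with metadata but no crawledAt; the commit handles this by treating crawledAt as the completion marker.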