From c8464e303f6e7fba6b88c7f29c0570c2b49a494d Mon Sep 17 00:00:00 2001
From: Mohamed Bassem
Date: Sun, 8 Feb 2026 21:07:37 +0000
Subject: feat(crawler): Split bookmark metadata updates into two phases for faster feedback (#2467)

* feat(crawler): write metadata to DB early for faster user feedback

Split the single DB transaction in crawlAndParseUrl into two phases:

- Phase 1: Write metadata (title, description, favicon, author, etc.)
  immediately after extraction, before downloading assets
- Phase 2: Write content and asset references after all assets are stored
  (banner image, screenshot, pdf, html content)

This gives users near-instant feedback with bookmark metadata while the
slower asset downloads and uploads happen in the background.

https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ

* fix(crawler): move crawledAt to phase 2 DB write

crawledAt should only be set once all assets are fully stored, not during
the early metadata write.

https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ

---------

Co-authored-by: Claude
---
 apps/workers/workers/crawlerWorker.ts | 54 +++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 22 deletions(-)

(limited to 'apps/workers')

diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 5869354f..9815571e 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1450,6 +1450,36 @@ async function crawlAndParseUrl(
   ]);
   abortSignal.throwIfAborted();
 
+  const parseDate = (date: string | undefined) => {
+    if (!date) {
+      return null;
+    }
+    try {
+      return new Date(date);
+    } catch {
+      return null;
+    }
+  };
+
+  // Phase 1: Write metadata immediately for fast user feedback.
+  // Content and asset storage happen later and can be slow (banner
+  // image download, screenshot/pdf upload, etc.).
+  await db
+    .update(bookmarkLinks)
+    .set({
+      title: meta.title,
+      description: meta.description,
+      // Don't store data URIs as they're not valid URLs and are usually quite large
+      imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
+      favicon: meta.logo,
+      crawlStatusCode: statusCode,
+      author: meta.author,
+      publisher: meta.publisher,
+      datePublished: parseDate(meta.datePublished),
+      dateModified: parseDate(meta.dateModified),
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
   let readableContent: { content: string } | null = meta.readableContentHtml
     ? { content: meta.readableContentHtml }
     : null;
@@ -1504,17 +1534,7 @@
   }
   abortSignal.throwIfAborted();
 
-  const parseDate = (date: string | undefined) => {
-    if (!date) {
-      return null;
-    }
-    try {
-      return new Date(date);
-    } catch {
-      return null;
-    }
-  };
-
+  // Phase 2: Write content and asset references.
   // TODO(important): Restrict the size of content to store
   const assetDeletionTasks: Promise<void>[] = [];
   const inlineHtmlContent =
@@ -1526,22 +1546,12 @@
     await txn
       .update(bookmarkLinks)
       .set({
-        title: meta.title,
-        description: meta.description,
-        // Don't store data URIs as they're not valid URLs and are usually quite large
-        imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
-        favicon: meta.logo,
+        crawledAt: new Date(),
         htmlContent: inlineHtmlContent,
         contentAssetId:
           htmlContentAssetInfo.result === "stored"
            ? htmlContentAssetInfo.assetId
            : null,
-        crawledAt: new Date(),
-        crawlStatusCode: statusCode,
-        author: meta.author,
-        publisher: meta.publisher,
-        datePublished: parseDate(meta.datePublished),
-        dateModified: parseDate(meta.dateModified),
       })
       .where(eq(bookmarkLinks.id, bookmarkId));
 
--
cgit v1.2.3-70-g09d2
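
As a reading aid, the two-phase update this patch introduces can be reduced to a standalone sketch. This is illustrative only: db, bookmarkLinks, and eq mirror the identifiers in the diff (drizzle-orm style), while Meta, storeAssets, and the import paths are hypothetical stand-ins rather than parts of the actual codebase.

// Minimal sketch of the two-phase write, assuming a drizzle-orm setup.
// Hypothetical stand-ins: "./db", "./schema", Meta, storeAssets.
import { eq } from "drizzle-orm";

import { db } from "./db"; // hypothetical DB handle
import { bookmarkLinks } from "./schema"; // hypothetical table definition

interface Meta {
  title?: string;
  description?: string;
}

// Hypothetical slow step standing in for banner image download,
// screenshot/pdf upload, and HTML content storage.
declare function storeAssets(
  bookmarkId: string,
): Promise<{ contentAssetId: string | null }>;

export async function crawlBookmark(bookmarkId: string, meta: Meta) {
  // Phase 1: cheap metadata write, giving near-instant user feedback.
  await db
    .update(bookmarkLinks)
    .set({ title: meta.title, description: meta.description })
    .where(eq(bookmarkLinks.id, bookmarkId));

  // The slow asset work happens between the two writes.
  const { contentAssetId } = await storeAssets(bookmarkId);

  // Phase 2: asset references, with crawledAt set only once assets exist.
  await db
    .update(bookmarkLinks)
    .set({ crawledAt: new Date(), contentAssetId })
    .where(eq(bookmarkLinks.id, bookmarkId));
}

Splitting the writes trades one transaction for two, so a crash between phases leaves a bookmark with metadata but no crawledAt; the commit handles this by treating crawledAt as the completion marker.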