author     Mohamed Bassem <me@mbassem.com>  2026-02-08 21:07:37 +0000
committer  GitHub <noreply@github.com>      2026-02-08 21:07:37 +0000
commit     c8464e303f6e7fba6b88c7f29c0570c2b49a494d
tree       f9d7a9f94c9ac0030402b9b13f0210b9ca916e08 /apps
parent     08f1a7973294777db5950d6f973590bdceeb1259
download   karakeep-c8464e303f6e7fba6b88c7f29c0570c2b49a494d.tar.zst
feat(crawler): Split bookmark metadata updates into two phases for faster feedback (#2467)
* feat(crawler): write metadata to DB early for faster user feedback
Split the single DB transaction in crawlAndParseUrl into two phases:
- Phase 1: Write metadata (title, description, favicon, author, etc.)
immediately after extraction, before downloading assets
- Phase 2: Write content and asset references after all assets are
stored (banner image, screenshot, pdf, html content)
This gives users near-instant feedback with bookmark metadata while
the slower asset downloads and uploads happen in the background.
https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ
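To make the two-phase flow concrete, here is a minimal sketch of the pattern, assuming the drizzle-style `db.update(...).set(...).where(...)` calls that appear in the diff below. The helper names, import paths, and the trimmed-down column set are illustrative, not part of the actual change:

```ts
import { eq } from "drizzle-orm";

import { db } from "~/db"; // hypothetical import path
import { bookmarkLinks } from "~/db/schema"; // hypothetical import path

// Phase 1 (fast): persist extracted metadata as soon as parsing is done,
// so the UI can show title/description while assets are still downloading.
async function writeMetadataPhase(
  bookmarkId: string,
  meta: { title?: string; description?: string },
) {
  await db
    .update(bookmarkLinks)
    .set({ title: meta.title, description: meta.description })
    .where(eq(bookmarkLinks.id, bookmarkId));
}

// Phase 2 (slow): persist content and asset references only after the
// banner image, screenshot, pdf, and html content are fully stored.
async function writeContentPhase(bookmarkId: string, htmlContent: string) {
  await db
    .update(bookmarkLinks)
    .set({ htmlContent, crawledAt: new Date() })
    .where(eq(bookmarkLinks.id, bookmarkId));
}
```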
* fix(crawler): move crawledAt to phase 2 DB write
crawledAt should only be set once all assets are fully stored, not
during the early metadata write.
https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ
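In other words, a non-null `crawledAt` is the signal that crawling is complete. A hypothetical consumer (not Karakeep's actual API) shows why the phase-1 write must leave it unset:

```ts
// If crawledAt were written in phase 1, this would report "done"
// while the screenshot/pdf uploads were still in flight.
function crawlStatus(row: { crawledAt: Date | null }): "pending" | "done" {
  return row.crawledAt === null ? "pending" : "done";
}
```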
---------
Co-authored-by: Claude <noreply@anthropic.com>
Diffstat (limited to 'apps')
-rw-r--r--  apps/workers/workers/crawlerWorker.ts  54
1 file changed, 32 insertions, 22 deletions
```diff
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 5869354f..9815571e 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1450,6 +1450,36 @@ async function crawlAndParseUrl(
   ]);
   abortSignal.throwIfAborted();
 
+  const parseDate = (date: string | undefined) => {
+    if (!date) {
+      return null;
+    }
+    try {
+      return new Date(date);
+    } catch {
+      return null;
+    }
+  };
+
+  // Phase 1: Write metadata immediately for fast user feedback.
+  // Content and asset storage happen later and can be slow (banner
+  // image download, screenshot/pdf upload, etc.).
+  await db
+    .update(bookmarkLinks)
+    .set({
+      title: meta.title,
+      description: meta.description,
+      // Don't store data URIs as they're not valid URLs and are usually quite large
+      imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
+      favicon: meta.logo,
+      crawlStatusCode: statusCode,
+      author: meta.author,
+      publisher: meta.publisher,
+      datePublished: parseDate(meta.datePublished),
+      dateModified: parseDate(meta.dateModified),
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
   let readableContent: { content: string } | null = meta.readableContentHtml
     ? { content: meta.readableContentHtml }
     : null;
@@ -1504,17 +1534,7 @@
   }
   abortSignal.throwIfAborted();
 
-  const parseDate = (date: string | undefined) => {
-    if (!date) {
-      return null;
-    }
-    try {
-      return new Date(date);
-    } catch {
-      return null;
-    }
-  };
-
+  // Phase 2: Write content and asset references.
   // TODO(important): Restrict the size of content to store
   const assetDeletionTasks: Promise<void>[] = [];
   const inlineHtmlContent =
@@ -1526,22 +1546,12 @@
       await txn
         .update(bookmarkLinks)
         .set({
-          title: meta.title,
-          description: meta.description,
-          // Don't store data URIs as they're not valid URLs and are usually quite large
-          imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
-          favicon: meta.logo,
+          crawledAt: new Date(),
           htmlContent: inlineHtmlContent,
           contentAssetId:
             htmlContentAssetInfo.result === "stored"
               ? htmlContentAssetInfo.assetId
               : null,
-          crawledAt: new Date(),
-          crawlStatusCode: statusCode,
-          author: meta.author,
-          publisher: meta.publisher,
-          datePublished: parseDate(meta.datePublished),
-          dateModified: parseDate(meta.dateModified),
         })
         .where(eq(bookmarkLinks.id, bookmarkId));
```
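As an aside on the `parseDate` helper quoted above: in JavaScript, `new Date("not a date")` returns an Invalid Date rather than throwing, so a `try/catch` by itself will not map unparseable strings to `null`. A common variant of the guard, shown only as a sketch and not as what the commit does:

```ts
const parseDate = (date: string | undefined): Date | null => {
  if (!date) {
    return null;
  }
  const parsed = new Date(date);
  // An unparseable string yields an Invalid Date (NaN timestamp)
  // instead of an exception, so validate the result explicitly.
  return Number.isNaN(parsed.getTime()) ? null : parsed;
};
```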
