author     Mohamed Bassem <me@mbassem.com>  2026-02-08 21:07:37 +0000
committer  GitHub <noreply@github.com>      2026-02-08 21:07:37 +0000
commit     c8464e303f6e7fba6b88c7f29c0570c2b49a494d (patch)
tree       f9d7a9f94c9ac0030402b9b13f0210b9ca916e08 /apps
parent     08f1a7973294777db5950d6f973590bdceeb1259 (diff)
download   karakeep-c8464e303f6e7fba6b88c7f29c0570c2b49a494d.tar.zst
feat(crawler): Split bookmark metadata updates into two phases for faster feedback (#2467)
* feat(crawler): write metadata to DB early for faster user feedback

  Split the single DB transaction in crawlAndParseUrl into two phases:

  - Phase 1: Write metadata (title, description, favicon, author, etc.)
    immediately after extraction, before downloading assets
  - Phase 2: Write content and asset references after all assets are
    stored (banner image, screenshot, pdf, html content)

  This gives users near-instant feedback with bookmark metadata while
  the slower asset downloads and uploads happen in the background.

  https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ

* fix(crawler): move crawledAt to phase 2 DB write

  crawledAt should only be set once all assets are fully stored, not
  during the early metadata write.

  https://claude.ai/code/session_013vKTXDcb5CEve3WMszQJmZ

---------

Co-authored-by: Claude <noreply@anthropic.com>
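For orientation, here is a minimal sketch of the two-phase pattern the message describes; this is not the actual worker code. The `db`, `bookmarkLinks`, and `eq` identifiers mirror the Drizzle calls in the diff below, while `fetchMetadata` and `downloadAndStoreAssets` are hypothetical stand-ins for the crawler's fast and slow steps.

import { eq } from "drizzle-orm";

declare const db: any; // assumed: the worker's Drizzle connection
declare const bookmarkLinks: any; // assumed: the bookmark_links table schema
declare function fetchMetadata(
  url: string,
): Promise<{ title: string; description: string }>;
declare function downloadAndStoreAssets(
  meta: unknown,
): Promise<{ contentAssetId: string | null }>;

async function crawlSketch(bookmarkId: string, url: string): Promise<void> {
  // Fast: metadata is available as soon as the page is parsed.
  const meta = await fetchMetadata(url);

  // Phase 1: persist metadata immediately so the UI can show the
  // bookmark's title/description without waiting for asset storage.
  await db
    .update(bookmarkLinks)
    .set({ title: meta.title, description: meta.description })
    .where(eq(bookmarkLinks.id, bookmarkId));

  // Slow: banner image download, screenshot/pdf upload, etc.
  const { contentAssetId } = await downloadAndStoreAssets(meta);

  // Phase 2: content and asset references; crawledAt is set only here,
  // since the bookmark isn't fully crawled until its assets are stored.
  await db
    .update(bookmarkLinks)
    .set({ crawledAt: new Date(), contentAssetId })
    .where(eq(bookmarkLinks.id, bookmarkId));
}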
Diffstat (limited to 'apps')
-rw-r--r--  apps/workers/workers/crawlerWorker.ts  54
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 5869354f..9815571e 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1450,6 +1450,36 @@ async function crawlAndParseUrl(
   ]);
   abortSignal.throwIfAborted();

+  const parseDate = (date: string | undefined) => {
+    if (!date) {
+      return null;
+    }
+    try {
+      return new Date(date);
+    } catch {
+      return null;
+    }
+  };
+
+  // Phase 1: Write metadata immediately for fast user feedback.
+  // Content and asset storage happen later and can be slow (banner
+  // image download, screenshot/pdf upload, etc.).
+  await db
+    .update(bookmarkLinks)
+    .set({
+      title: meta.title,
+      description: meta.description,
+      // Don't store data URIs as they're not valid URLs and are usually quite large
+      imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
+      favicon: meta.logo,
+      crawlStatusCode: statusCode,
+      author: meta.author,
+      publisher: meta.publisher,
+      datePublished: parseDate(meta.datePublished),
+      dateModified: parseDate(meta.dateModified),
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
   let readableContent: { content: string } | null = meta.readableContentHtml
     ? { content: meta.readableContentHtml }
     : null;
@@ -1504,17 +1534,7 @@ async function crawlAndParseUrl(
   }
   abortSignal.throwIfAborted();

-  const parseDate = (date: string | undefined) => {
-    if (!date) {
-      return null;
-    }
-    try {
-      return new Date(date);
-    } catch {
-      return null;
-    }
-  };
-
+  // Phase 2: Write content and asset references.
   // TODO(important): Restrict the size of content to store
   const assetDeletionTasks: Promise<void>[] = [];
   const inlineHtmlContent =
@@ -1526,22 +1546,12 @@ async function crawlAndParseUrl(
     await txn
       .update(bookmarkLinks)
       .set({
-        title: meta.title,
-        description: meta.description,
-        // Don't store data URIs as they're not valid URLs and are usually quite large
-        imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
-        favicon: meta.logo,
+        crawledAt: new Date(),
         htmlContent: inlineHtmlContent,
         contentAssetId:
           htmlContentAssetInfo.result === "stored"
            ? htmlContentAssetInfo.assetId
            : null,
-        crawledAt: new Date(),
-        crawlStatusCode: statusCode,
-        author: meta.author,
-        publisher: meta.publisher,
-        datePublished: parseDate(meta.datePublished),
-        dateModified: parseDate(meta.dateModified),
       })
       .where(eq(bookmarkLinks.id, bookmarkId));
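One subtlety in the moved parseDate helper is worth noting: new Date(str) does not throw on unparseable input, it returns an Invalid Date, so the catch branch is effectively dead and an invalid date string would be stored as an Invalid Date. A stricter variant (a sketch, not part of this commit) would validate the time value instead:

// Sketch (not part of this commit): new Date("not a date") yields an
// Invalid Date rather than throwing, so check its time value explicitly.
const parseDateStrict = (date: string | undefined): Date | null => {
  if (!date) {
    return null;
  }
  const parsed = new Date(date);
  return Number.isNaN(parsed.getTime()) ? null : parsed;
};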