-rw-r--r--  apps/workers/workers/crawlerWorker.ts  54
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 5869354f..9815571e 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1450,6 +1450,36 @@ async function crawlAndParseUrl(
   ]);
   abortSignal.throwIfAborted();

+  const parseDate = (date: string | undefined) => {
+    if (!date) {
+      return null;
+    }
+    try {
+      return new Date(date);
+    } catch {
+      return null;
+    }
+  };
+
+  // Phase 1: Write metadata immediately for fast user feedback.
+  // Content and asset storage happen later and can be slow (banner
+  // image download, screenshot/pdf upload, etc.).
+  await db
+    .update(bookmarkLinks)
+    .set({
+      title: meta.title,
+      description: meta.description,
+      // Don't store data URIs as they're not valid URLs and are usually quite large
+      imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
+      favicon: meta.logo,
+      crawlStatusCode: statusCode,
+      author: meta.author,
+      publisher: meta.publisher,
+      datePublished: parseDate(meta.datePublished),
+      dateModified: parseDate(meta.dateModified),
+    })
+    .where(eq(bookmarkLinks.id, bookmarkId));
+
   let readableContent: { content: string } | null = meta.readableContentHtml
     ? { content: meta.readableContentHtml }
     : null;
@@ -1504,17 +1534,7 @@ async function crawlAndParseUrl(
   }
   abortSignal.throwIfAborted();

-  const parseDate = (date: string | undefined) => {
-    if (!date) {
-      return null;
-    }
-    try {
-      return new Date(date);
-    } catch {
-      return null;
-    }
-  };
-
+  // Phase 2: Write content and asset references.
   // TODO(important): Restrict the size of content to store
   const assetDeletionTasks: Promise<void>[] = [];
   const inlineHtmlContent =
@@ -1526,22 +1546,12 @@ async function crawlAndParseUrl(
     await txn
       .update(bookmarkLinks)
       .set({
-        title: meta.title,
-        description: meta.description,
-        // Don't store data URIs as they're not valid URLs and are usually quite large
-        imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
-        favicon: meta.logo,
+        crawledAt: new Date(),
         htmlContent: inlineHtmlContent,
         contentAssetId:
           htmlContentAssetInfo.result === "stored"
             ? htmlContentAssetInfo.assetId
             : null,
-        crawledAt: new Date(),
-        crawlStatusCode: statusCode,
-        author: meta.author,
-        publisher: meta.publisher,
-        datePublished: parseDate(meta.datePublished),
-        dateModified: parseDate(meta.dateModified),
       })
       .where(eq(bookmarkLinks.id, bookmarkId));
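
For readers skimming the patch, a minimal, self-contained sketch of the two-phase write it introduces, assuming a Drizzle ORM database handle and a bookmarkLinks table with the columns used above; the PageMeta shape, the storeContent callback, and the import paths are illustrative placeholders, not the worker's real modules.

// Sketch only: a simplified version of the two-phase bookmark update shown
// in the diff. Import paths and helper names are assumptions for illustration.
import { eq } from "drizzle-orm";

import { db } from "./db"; // assumed: the worker's Drizzle database handle
import { bookmarkLinks } from "./db/schema"; // assumed: the schema module

interface PageMeta {
  title?: string;
  description?: string;
  datePublished?: string;
}

// Variant of the patch's parseDate: new Date() does not throw on bad input,
// so this checks for an Invalid Date instead of relying on try/catch.
const parseDate = (date: string | undefined) => {
  if (!date) {
    return null;
  }
  const parsed = new Date(date);
  return Number.isNaN(parsed.getTime()) ? null : parsed;
};

async function updateBookmarkInTwoPhases(
  bookmarkId: string,
  meta: PageMeta,
  statusCode: number,
  storeContent: () => Promise<{
    htmlContent: string | null;
    contentAssetId: string | null;
  }>,
) {
  // Phase 1: metadata is written as soon as the page has been parsed, so the
  // UI can show title/description without waiting on slow asset uploads.
  await db
    .update(bookmarkLinks)
    .set({
      title: meta.title,
      description: meta.description,
      crawlStatusCode: statusCode,
      datePublished: parseDate(meta.datePublished),
    })
    .where(eq(bookmarkLinks.id, bookmarkId));

  // Phase 2: the slow work (content and asset storage) runs afterwards; only
  // then are the content reference and crawledAt written.
  const { htmlContent, contentAssetId } = await storeContent();
  await db
    .update(bookmarkLinks)
    .set({
      crawledAt: new Date(),
      htmlContent,
      contentAssetId,
    })
    .where(eq(bookmarkLinks.id, bookmarkId));
}

The sketch keeps the same split as the patch: cheap metadata columns in the first UPDATE, content columns plus crawledAt in the second, both keyed on bookmarkLinks.id.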