aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers/crawlerWorker.ts
diff options
context:
space:
mode:
author Mohamed Bassem <me@mbassem.com> 2025-02-02 15:44:48 +0000
committer Mohamed Bassem <me@mbassem.com> 2025-02-02 15:47:21 +0000
commit 68e27adb029cb7bb7b51b8ea594163931a495c61 (patch)
tree f5ec56769072c35adb16c43a7a21686eb93477a2 /apps/workers/crawlerWorker.ts
parent b59fe2ee819acc4c8115c9f6322050e2d1dc2204 (diff)
download karakeep-68e27adb029cb7bb7b51b8ea594163931a495c61.tar.zst
fix: Don't rearchive singlefile uploads and consider them as archives
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
-rw-r--r-- apps/workers/crawlerWorker.ts 8
1 file changed, 6 insertions, 2 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 91adb185..7611494e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -669,7 +669,8 @@ async function crawlAndParseUrl(
.set({
title: meta.title,
description: meta.description,
- imageUrl: meta.image,
+ // Don't store data URIs as they're not valid URLs and are usually quite large
+ imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
favicon: meta.logo,
content: readableContent?.textContent,
htmlContent: readableContent?.content,
@@ -705,7 +706,10 @@ async function crawlAndParseUrl(
]);
return async () => {
- if (serverConfig.crawler.fullPageArchive || archiveFullPage) {
+ if (
+ !precrawledArchiveAssetId &&
+ (serverConfig.crawler.fullPageArchive || archiveFullPage)
+ ) {
const {
assetId: fullPageArchiveAssetId,
size,