aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers/crawlerWorker.ts
diff options
context:
space:
mode:
authorMohamedBassem <me@mbassem.com>2024-10-06 14:33:40 +0000
committerMohamedBassem <me@mbassem.com>2024-10-06 15:51:45 +0000
commit1e5c575e16c8a9e6bd7592e83bea53af7f359e15 (patch)
treed706b7d0dad309e1c4e5ede15fcae8a5e2547c8a /apps/workers/crawlerWorker.ts
parentdb2d346cd6c265d8e7c69383c6de03bf5f6252f0 (diff)
downloadkarakeep-1e5c575e16c8a9e6bd7592e83bea53af7f359e15.tar.zst
refactor: Start tracking bookmark assets in the assets table
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
-rw-r--r--apps/workers/crawlerWorker.ts143
1 files changed, 83 insertions, 60 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index f830c500..74413c63 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -36,6 +36,7 @@ import { DequeuedJob, Runner } from "@hoarder/queue";
import {
ASSET_TYPES,
deleteAsset,
+ getAssetSize,
IMAGE_ASSET_TYPES,
newAssetId,
saveAsset,
@@ -192,6 +193,8 @@ export class CrawlerWorker {
}
}
+type DBAssetType = typeof assets.$inferInsert;
+
async function changeBookmarkStatus(
bookmarkId: string,
crawlStatus: "success" | "failure",
@@ -353,16 +356,18 @@ async function storeScreenshot(
return null;
}
const assetId = newAssetId();
+ const contentType = "image/png";
+ const fileName = "screenshot.png";
await saveAsset({
userId,
assetId,
- metadata: { contentType: "image/png", fileName: "screenshot.png" },
+ metadata: { contentType, fileName },
asset: screenshot,
});
logger.info(
`[Crawler][${jobId}] Stored the screenshot as assetId: ${assetId}`,
);
- return assetId;
+ return { assetId, contentType, fileName, size: screenshot.byteLength };
}
async function downloadAndStoreFile(
@@ -396,7 +401,7 @@ async function downloadAndStoreFile(
`[Crawler][${jobId}] Downloaded ${fileType} as assetId: ${assetId}`,
);
- return assetId;
+ return { assetId, userId, contentType, size: buffer.byteLength };
} catch (e) {
logger.error(
`[Crawler][${jobId}] Failed to download and store ${fileType}: ${e}`,
@@ -433,12 +438,14 @@ async function archiveWebpage(
input: html,
})`monolith - -Ije -t 5 -b ${url} -o ${assetPath}`;
+ const contentType = "text/html";
+
await saveAssetFromFile({
userId,
assetId,
assetPath,
metadata: {
- contentType: "text/html",
+ contentType,
},
});
@@ -446,7 +453,11 @@ async function archiveWebpage(
`[Crawler][${jobId}] Done archiving the page as assetId: ${assetId}`,
);
- return assetId;
+ return {
+ assetId,
+ contentType,
+ size: await getAssetSize({ userId, assetId }),
+ };
}
async function getContentType(
@@ -489,17 +500,31 @@ async function handleAsAssetBookmark(
jobId: string,
bookmarkId: string,
) {
- const assetId = await downloadAndStoreFile(url, userId, jobId, assetType);
- if (!assetId) {
+ const downloaded = await downloadAndStoreFile(url, userId, jobId, assetType);
+ if (!downloaded) {
return;
}
+ const fileName = path.basename(new URL(url).pathname);
await db.transaction(async (trx) => {
+ await updateAsset(
+ undefined,
+ {
+ id: downloaded.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.BOOKMARK_ASSET,
+ contentType: downloaded.contentType,
+ size: downloaded.size,
+ fileName,
+ },
+ trx,
+ );
await trx.insert(bookmarkAssets).values({
id: bookmarkId,
assetType,
- assetId,
+ assetId: downloaded.assetId,
content: null,
- fileName: path.basename(new URL(url).pathname),
+ fileName,
sourceUrl: url,
});
// Switch the type of the bookmark from LINK to ASSET
@@ -527,14 +552,24 @@ async function crawlAndParseUrl(
url: browserUrl,
} = await crawlPage(jobId, url);
- const [meta, readableContent, screenshotAssetId] = await Promise.all([
+ const [meta, readableContent, screenshotAssetInfo] = await Promise.all([
extractMetadata(htmlContent, browserUrl, jobId),
extractReadableContent(htmlContent, browserUrl, jobId),
storeScreenshot(screenshot, userId, jobId),
]);
- let imageAssetId: string | null = null;
+ let imageAssetInfo: DBAssetType | null = null;
if (meta.image) {
- imageAssetId = await downloadAndStoreImage(meta.image, userId, jobId);
+ const downloaded = await downloadAndStoreImage(meta.image, userId, jobId);
+ if (downloaded) {
+ imageAssetInfo = {
+ id: downloaded.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_BANNER_IMAGE,
+ contentType: downloaded.contentType,
+ size: downloaded.size,
+ };
+ }
}
// TODO(important): Restrict the size of content to store
@@ -552,22 +587,24 @@ async function crawlAndParseUrl(
})
.where(eq(bookmarkLinks.id, bookmarkId));
- await updateAsset(
- screenshotAssetId,
- oldScreenshotAssetId,
- bookmarkId,
- userId,
- AssetTypes.LINK_SCREENSHOT,
- txn,
- );
- await updateAsset(
- imageAssetId,
- oldImageAssetId,
- bookmarkId,
- userId,
- AssetTypes.LINK_BANNER_IMAGE,
- txn,
- );
+ if (screenshotAssetInfo) {
+ await updateAsset(
+ oldScreenshotAssetId,
+ {
+ id: screenshotAssetInfo.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_SCREENSHOT,
+ contentType: screenshotAssetInfo.contentType,
+ size: screenshotAssetInfo.size,
+ fileName: screenshotAssetInfo.fileName,
+ },
+ txn,
+ );
+ }
+ if (imageAssetInfo) {
+ await updateAsset(oldImageAssetId, imageAssetInfo, txn);
+ }
});
// Delete the old assets if any
@@ -582,20 +619,24 @@ async function crawlAndParseUrl(
return async () => {
if (serverConfig.crawler.fullPageArchive || archiveFullPage) {
- const fullPageArchiveAssetId = await archiveWebpage(
- htmlContent,
- browserUrl,
- userId,
- jobId,
- );
+ const {
+ assetId: fullPageArchiveAssetId,
+ size,
+ contentType,
+ } = await archiveWebpage(htmlContent, browserUrl, userId, jobId);
await db.transaction(async (txn) => {
await updateAsset(
- fullPageArchiveAssetId,
oldFullPageArchiveAssetId,
- bookmarkId,
- userId,
- AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+ {
+ id: fullPageArchiveAssetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+ contentType,
+ size,
+ fileName: null,
+ },
txn,
);
});
@@ -676,31 +717,13 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
await archivalLogic();
}
-/**
- * Removes the old asset and adds a new one instead
- * @param newAssetId the new assetId to add
- * @param oldAssetId the old assetId to remove (if it exists)
- * @param bookmarkId the id of the bookmark the asset belongs to
- * @param assetType the type of the asset
- * @param txn the transaction where this update should happen in
- */
async function updateAsset(
- newAssetId: string | null,
oldAssetId: string | undefined,
- bookmarkId: string,
- userId: string,
- assetType: AssetTypes,
+ newAsset: DBAssetType,
txn: HoarderDBTransaction,
) {
- if (newAssetId) {
- if (oldAssetId) {
- await txn.delete(assets).where(eq(assets.id, oldAssetId));
- }
- await txn.insert(assets).values({
- id: newAssetId,
- assetType,
- bookmarkId,
- userId,
- });
+ if (oldAssetId) {
+ await txn.delete(assets).where(eq(assets.id, oldAssetId));
}
+ await txn.insert(assets).values(newAsset);
}