From 9ce6958ada86dade84e406e4e930775c59abf289 Mon Sep 17 00:00:00 2001 From: kamtschatka Date: Sun, 23 Jun 2024 13:08:27 +0200 Subject: refactor: extract assets into their own database table. #215 (#220) * Allow downloading more content from a webpage and index it #215 added a new table that contains the information about assets for link bookmarks created migration code that transfers the existing data into the new table * Allow downloading more content from a webpage and index it #215 removed the old asset columns from the database updated the UI to use the data from the linkBookmarkAssets array * generalize the assets table to not be linked in particular to links * fix migrations post merge * fix missing asset ids in the getBookmarks call --------- Co-authored-by: MohamedBassem --- apps/workers/crawlerWorker.ts | 100 ++++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 29 deletions(-) (limited to 'apps/workers') diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index eedb7b1e..e7e5515c 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -27,7 +27,13 @@ import { withTimeout } from "utils"; import type { ZCrawlLinkRequest } from "@hoarder/shared/queues"; import { db } from "@hoarder/db"; -import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema"; +import { + assets, + AssetTypes, + bookmarkAssets, + bookmarkLinks, + bookmarks, +} from "@hoarder/db/schema"; import { ASSET_TYPES, deleteAsset, @@ -194,7 +200,10 @@ async function changeBookmarkStatus( async function getBookmarkDetails(bookmarkId: string) { const bookmark = await db.query.bookmarks.findFirst({ where: eq(bookmarks.id, bookmarkId), - with: { link: true }, + with: { + link: true, + assets: true, + }, }); if (!bookmark || !bookmark.link) { @@ -203,9 +212,15 @@ async function getBookmarkDetails(bookmarkId: string) { return { url: bookmark.link.url, userId: bookmark.userId, - screenshotAssetId: bookmark.link.screenshotAssetId, - imageAssetId: bookmark.link.imageAssetId, - fullPageArchiveAssetId: bookmark.link.fullPageArchiveAssetId, + screenshotAssetId: bookmark.assets.find( + (a) => a.assetType == AssetTypes.LINK_SCREENSHOT, + )?.id, + imageAssetId: bookmark.assets.find( + (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE, + )?.id, + fullPageArchiveAssetId: bookmark.assets.find( + (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE, + )?.id, }; } @@ -490,9 +505,9 @@ async function crawlAndParseUrl( userId: string, jobId: string, bookmarkId: string, - oldScreenshotAssetId: string | null, - oldImageAssetId: string | null, - oldFullPageArchiveAssetId: string | null, + oldScreenshotAssetId: string | undefined, + oldImageAssetId: string | undefined, + oldFullPageArchiveAssetId: string | undefined, ) { const { htmlContent, @@ -511,20 +526,42 @@ async function crawlAndParseUrl( } // TODO(important): Restrict the size of content to store - await db - .update(bookmarkLinks) - .set({ - title: meta.title, - description: meta.description, - imageUrl: meta.image, - favicon: meta.logo, - content: readableContent?.textContent, - htmlContent: readableContent?.content, - screenshotAssetId, - imageAssetId, - crawledAt: new Date(), - }) - .where(eq(bookmarkLinks.id, bookmarkId)); + await db.transaction(async (txn) => { + await txn + .update(bookmarkLinks) + .set({ + title: meta.title, + description: meta.description, + imageUrl: meta.image, + favicon: meta.logo, + content: readableContent?.textContent, + htmlContent: readableContent?.content, + crawledAt: new Date(), + }) + .where(eq(bookmarkLinks.id, bookmarkId)); + + if (screenshotAssetId) { + if (oldScreenshotAssetId) { + await txn.delete(assets).where(eq(assets.id, oldScreenshotAssetId)); + } + await txn.insert(assets).values({ + id: screenshotAssetId, + assetType: AssetTypes.LINK_SCREENSHOT, + bookmarkId, + }); + } + + if (imageAssetId) { + if (oldImageAssetId) { + await txn.delete(assets).where(eq(assets.id, oldImageAssetId)); + } + await txn.insert(assets).values({ + id: imageAssetId, + assetType: AssetTypes.LINK_BANNER_IMAGE, + bookmarkId, + }); + } + }); // Delete the old assets if any await Promise.all([ @@ -545,13 +582,18 @@ async function crawlAndParseUrl( jobId, ); - await db - .update(bookmarkLinks) - .set({ - fullPageArchiveAssetId, - }) - .where(eq(bookmarkLinks.id, bookmarkId)); - + await db.transaction(async (txn) => { + if (oldFullPageArchiveAssetId) { + await txn + .delete(assets) + .where(eq(assets.id, oldFullPageArchiveAssetId)); + } + await txn.insert(assets).values({ + id: fullPageArchiveAssetId, + assetType: AssetTypes.LINK_FULL_PAGE_ARCHIVE, + bookmarkId, + }); + }); if (oldFullPageArchiveAssetId) { deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch( () => ({}), -- cgit v1.2.3-70-g09d2