aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers
diff options
context:
space:
mode:
author: kamtschatka <simon.schatka@gmx.at> 2024-06-23 13:08:27 +0200
committer: GitHub <noreply@github.com> 2024-06-23 12:08:27 +0100
commit: 9ce6958ada86dade84e406e4e930775c59abf289 (patch)
tree: 39fefd8495d344e2eb336a47d77eefc0f1b7051b /apps/workers
parent: 0f54a18212b6e34d819e3a3c50f5479c6ce3771b (diff)
download: karakeep-9ce6958ada86dade84e406e4e930775c59abf289.tar.zst
refactor: extract assets into their own database table. #215 (#220)
* Allow downloading more content from a webpage and index it #215
  - added a new table that contains the information about assets for link bookmarks
  - created migration code that transfers the existing data into the new table
* Allow downloading more content from a webpage and index it #215
  - removed the old asset columns from the database
  - updated the UI to use the data from the linkBookmarkAssets array
* generalize the assets table to not be linked in particular to links
* fix migrations post merge
* fix missing asset ids in the getBookmarks call

---------

Co-authored-by: MohamedBassem <me@mbassem.com>
Diffstat (limited to 'apps/workers')
-rw-r--r-- apps/workers/crawlerWorker.ts 100
1 file changed, 71 insertions, 29 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index eedb7b1e..e7e5515c 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -27,7 +27,13 @@ import { withTimeout } from "utils";
import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
-import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
+import {
+ assets,
+ AssetTypes,
+ bookmarkAssets,
+ bookmarkLinks,
+ bookmarks,
+} from "@hoarder/db/schema";
import {
ASSET_TYPES,
deleteAsset,
@@ -194,7 +200,10 @@ async function changeBookmarkStatus(
async function getBookmarkDetails(bookmarkId: string) {
const bookmark = await db.query.bookmarks.findFirst({
where: eq(bookmarks.id, bookmarkId),
- with: { link: true },
+ with: {
+ link: true,
+ assets: true,
+ },
});
if (!bookmark || !bookmark.link) {
@@ -203,9 +212,15 @@ async function getBookmarkDetails(bookmarkId: string) {
return {
url: bookmark.link.url,
userId: bookmark.userId,
- screenshotAssetId: bookmark.link.screenshotAssetId,
- imageAssetId: bookmark.link.imageAssetId,
- fullPageArchiveAssetId: bookmark.link.fullPageArchiveAssetId,
+ screenshotAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
+ )?.id,
+ imageAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
+ )?.id,
+ fullPageArchiveAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+ )?.id,
};
}
@@ -490,9 +505,9 @@ async function crawlAndParseUrl(
userId: string,
jobId: string,
bookmarkId: string,
- oldScreenshotAssetId: string | null,
- oldImageAssetId: string | null,
- oldFullPageArchiveAssetId: string | null,
+ oldScreenshotAssetId: string | undefined,
+ oldImageAssetId: string | undefined,
+ oldFullPageArchiveAssetId: string | undefined,
) {
const {
htmlContent,
@@ -511,20 +526,42 @@ async function crawlAndParseUrl(
}
// TODO(important): Restrict the size of content to store
- await db
- .update(bookmarkLinks)
- .set({
- title: meta.title,
- description: meta.description,
- imageUrl: meta.image,
- favicon: meta.logo,
- content: readableContent?.textContent,
- htmlContent: readableContent?.content,
- screenshotAssetId,
- imageAssetId,
- crawledAt: new Date(),
- })
- .where(eq(bookmarkLinks.id, bookmarkId));
+ await db.transaction(async (txn) => {
+ await txn
+ .update(bookmarkLinks)
+ .set({
+ title: meta.title,
+ description: meta.description,
+ imageUrl: meta.image,
+ favicon: meta.logo,
+ content: readableContent?.textContent,
+ htmlContent: readableContent?.content,
+ crawledAt: new Date(),
+ })
+ .where(eq(bookmarkLinks.id, bookmarkId));
+
+ if (screenshotAssetId) {
+ if (oldScreenshotAssetId) {
+ await txn.delete(assets).where(eq(assets.id, oldScreenshotAssetId));
+ }
+ await txn.insert(assets).values({
+ id: screenshotAssetId,
+ assetType: AssetTypes.LINK_SCREENSHOT,
+ bookmarkId,
+ });
+ }
+
+ if (imageAssetId) {
+ if (oldImageAssetId) {
+ await txn.delete(assets).where(eq(assets.id, oldImageAssetId));
+ }
+ await txn.insert(assets).values({
+ id: imageAssetId,
+ assetType: AssetTypes.LINK_BANNER_IMAGE,
+ bookmarkId,
+ });
+ }
+ });
// Delete the old assets if any
await Promise.all([
@@ -545,13 +582,18 @@ async function crawlAndParseUrl(
jobId,
);
- await db
- .update(bookmarkLinks)
- .set({
- fullPageArchiveAssetId,
- })
- .where(eq(bookmarkLinks.id, bookmarkId));
-
+ await db.transaction(async (txn) => {
+ if (oldFullPageArchiveAssetId) {
+ await txn
+ .delete(assets)
+ .where(eq(assets.id, oldFullPageArchiveAssetId));
+ }
+ await txn.insert(assets).values({
+ id: fullPageArchiveAssetId,
+ assetType: AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+ bookmarkId,
+ });
+ });
if (oldFullPageArchiveAssetId) {
deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
() => ({}),