| field | value | date |
|---|---|---|
| author | kamtschatka <simon.schatka@gmx.at> | 2024-10-28 02:51:00 +0100 |
| committer | GitHub <noreply@github.com> | 2024-10-28 01:51:00 +0000 |
| commit | 4a13c36da50f6b3171d817edebefe96ba85dc666 | |
| tree | 60ff553426493e7ae2460e73c3500a5525ba735c | |
| parent | 3b7451f4d0727d597c0af0e602f0c74cf58999af | |
| download | karakeep-4a13c36da50f6b3171d817edebefe96ba85dc666.tar.zst | |
feature: Archive videos using yt-dlp. Fixes #215 (#525)
* Allow downloading more content from a webpage and index it (#215)
  * Added a worker that downloads videos, gated by environment variables (a sketch of this approach follows the commit message)
  * Refactored the code a bit
  * Added a new video asset
  * Updated the documentation
* Some tweaks
* Drop the dependency on the yt-dlp wrapper
* Update the OpenAPI specs
* Don't log an error when the URL is not supported
* Better handle supported websites that don't download anything
---------
Co-authored-by: Mohamed Bassem <me@mbassem.com>
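Per the commit message, the new video worker is gated by environment variables and, since the wrapper dependency was dropped, shells out to the yt-dlp binary directly. A minimal sketch of what that combination can look like — the environment-variable names, the size limit, and the `downloadVideo` helper are illustrative assumptions, not Karakeep's actual configuration surface:

```typescript
// Hypothetical sketch of an env-gated video download that invokes the yt-dlp
// binary directly (no wrapper library). All names below are assumptions.
import { execFile } from "node:child_process";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// Opt-in via environment variables (names are illustrative)
const videoDownloadEnabled = process.env.CRAWLER_VIDEO_DOWNLOAD === "true";
const maxFileSizeMB = Number(process.env.CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE ?? "50");

async function downloadVideo(url: string, outputPath: string): Promise<boolean> {
  if (!videoDownloadEnabled) {
    return false; // Feature is off unless explicitly enabled
  }
  try {
    // yt-dlp exits non-zero for unsupported URLs; per the commit message,
    // that case is treated as "no video" rather than logged as an error.
    await execFileAsync("yt-dlp", [
      "--no-playlist",
      "--max-filesize",
      `${maxFileSizeMB}M`,
      "-o",
      outputPath,
      url,
    ]);
    return true;
  } catch {
    return false;
  }
}
```

A worker in this shape also has to handle the case called out in the last bullet: yt-dlp can recognize a site as supported yet still produce no file, so the caller should check that the output file actually exists before saving a video asset.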
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 59 |
1 file changed, 10 insertions, 49 deletions
```diff
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index ca0f6608..d5bc555e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -23,9 +23,10 @@ import puppeteer from "puppeteer-extra";
 import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withTimeout } from "utils";
+import { getBookmarkDetails, updateAsset } from "workerUtils";
 
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
-import { db, HoarderDBTransaction } from "@hoarder/db";
+import { db } from "@hoarder/db";
 import {
   assets,
   AssetTypes,
@@ -35,12 +36,12 @@ import {
 } from "@hoarder/db/schema";
 import {
   ASSET_TYPES,
-  deleteAsset,
   getAssetSize,
   IMAGE_ASSET_TYPES,
   newAssetId,
   saveAsset,
   saveAssetFromFile,
+  silentDeleteAsset,
   SUPPORTED_UPLOAD_ASSET_TYPES,
 } from "@hoarder/shared/assetdb";
 import serverConfig from "@hoarder/shared/config";
@@ -49,6 +50,7 @@ import {
   LinkCrawlerQueue,
   OpenAIQueue,
   triggerSearchReindex,
+  triggerVideoWorker,
   zCrawlLinkRequestSchema,
 } from "@hoarder/shared/queues";
 import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
@@ -207,33 +209,6 @@ async function changeBookmarkStatus(
     .where(eq(bookmarkLinks.id, bookmarkId));
 }
 
-async function getBookmarkDetails(bookmarkId: string) {
-  const bookmark = await db.query.bookmarks.findFirst({
-    where: eq(bookmarks.id, bookmarkId),
-    with: {
-      link: true,
-      assets: true,
-    },
-  });
-
-  if (!bookmark || !bookmark.link) {
-    throw new Error("The bookmark either doesn't exist or is not a link");
-  }
-  return {
-    url: bookmark.link.url,
-    userId: bookmark.userId,
-    screenshotAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
-    )?.id,
-    imageAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
-    )?.id,
-    fullPageArchiveAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
-    )?.id,
-  };
-}
-
 /**
  * This provides some "basic" protection from malicious URLs. However, all of those
  * can be easily circumvented by pointing dns of origin to localhost, or with
@@ -609,12 +584,8 @@ async function crawlAndParseUrl(
 
   // Delete the old assets if any
   await Promise.all([
-    oldScreenshotAssetId
-      ? deleteAsset({ userId, assetId: oldScreenshotAssetId }).catch(() => ({}))
-      : {},
-    oldImageAssetId
-      ? deleteAsset({ userId, assetId: oldImageAssetId }).catch(() => ({}))
-      : {},
+    silentDeleteAsset(userId, oldScreenshotAssetId),
+    silentDeleteAsset(userId, oldImageAssetId),
   ]);
 
   return async () => {
@@ -641,9 +612,7 @@ async function crawlAndParseUrl(
       );
     });
     if (oldFullPageArchiveAssetId) {
-      await deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
-        () => ({}),
-      );
+      silentDeleteAsset(userId, oldFullPageArchiveAssetId);
     }
   }
 };
@@ -713,17 +682,9 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
   // Update the search index
   await triggerSearchReindex(bookmarkId);
 
+  // Trigger a potential download of a video from the URL
+  await triggerVideoWorker(bookmarkId, url);
+
   // Do the archival as a separate last step as it has the potential for failure
   await archivalLogic();
 }
-
-async function updateAsset(
-  oldAssetId: string | undefined,
-  newAsset: DBAssetType,
-  txn: HoarderDBTransaction,
-) {
-  if (oldAssetId) {
-    await txn.delete(assets).where(eq(assets.id, oldAssetId));
-  }
-  await txn.insert(assets).values(newAsset);
-}
```
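The `silentDeleteAsset` helper that replaces the inline ternaries is imported from `@hoarder/shared/assetdb` and isn't shown in this diff. Judging purely from the call sites it replaces — delete only when an asset id is present, and swallow any failure — it plausibly looks like the following sketch (the real implementation may differ):

```typescript
// Inferred from the call sites above; the actual implementation in
// @hoarder/shared/assetdb may differ.
async function silentDeleteAsset(
  userId: string,
  assetId: string | undefined,
): Promise<void> {
  if (!assetId) {
    return; // Mirrors the old `oldId ? deleteAsset(...) : {}` ternary
  }
  // Swallow failures, as the old inline `.catch(() => ({}))` did
  await deleteAsset({ userId, assetId }).catch(() => ({}));
}
```

Note also that `getBookmarkDetails` and `updateAsset` weren't deleted outright: the new `import { getBookmarkDetails, updateAsset } from "workerUtils";` line shows they were moved into a shared `workerUtils` module, which accounts for most of the 49 removed lines.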
