| field | value | date |
|---|---|---|
| author | kamtschatka <simon.schatka@gmx.at> | 2024-10-28 02:51:00 +0100 |
| committer | GitHub <noreply@github.com> | 2024-10-28 01:51:00 +0000 |
| commit | 4a13c36da50f6b3171d817edebefe96ba85dc666 | |
| tree | 60ff553426493e7ae2460e73c3500a5525ba735c | |
| parent | 3b7451f4d0727d597c0af0e602f0c74cf58999af | |
| download | karakeep-4a13c36da50f6b3171d817edebefe96ba85dc666.tar.zst | |
feature: Archive videos using yt-dlp. Fixes #215 (#525)
* Allow downloading more content from a webpage and index it (#215)
  * Added a worker that downloads videos, gated by environment variables (a sketch of this approach follows the commit message)
  * Refactored the code a bit
  * Added a new video asset
  * Updated the documentation
* Some tweaks
* Drop the dependency on the yt-dlp wrapper
* Update the OpenAPI specs
* Don't log an error when the URL is not supported
* Better handle supported websites that don't download anything
---------
Co-authored-by: Mohamed Bassem <me@mbassem.com>
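Per the commit message, the new video worker is gated by environment variables and, since the wrapper dependency was dropped, shells out to the yt-dlp binary directly. A minimal sketch of what that combination can look like — the environment-variable names, the size limit, and the `downloadVideo` helper are illustrative assumptions, not Karakeep's actual configuration surface:

```typescript
// Hypothetical sketch of an env-gated video download that invokes the yt-dlp
// binary directly (no wrapper library). All names below are assumptions.
import { execFile } from "node:child_process";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// Opt-in via environment variables (names are illustrative)
const videoDownloadEnabled = process.env.CRAWLER_VIDEO_DOWNLOAD === "true";
const maxFileSizeMB = Number(process.env.CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE ?? "50");

async function downloadVideo(url: string, outputPath: string): Promise<boolean> {
  if (!videoDownloadEnabled) {
    return false; // Feature is off unless explicitly enabled
  }
  try {
    // yt-dlp exits non-zero for unsupported URLs; per the commit message,
    // that case is treated as "no video" rather than logged as an error.
    await execFileAsync("yt-dlp", [
      "--no-playlist",
      "--max-filesize",
      `${maxFileSizeMB}M`,
      "-o",
      outputPath,
      url,
    ]);
    return true;
  } catch {
    return false;
  }
}
```

A worker in this shape also has to handle the case called out in the last bullet: yt-dlp can recognize a site as supported yet still produce no file, so the caller should check that the output file actually exists before saving a video asset.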
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 59 |
1 file changed, 10 insertions, 49 deletions
```diff
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index ca0f6608..d5bc555e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -23,9 +23,10 @@ import puppeteer from "puppeteer-extra";
 import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withTimeout } from "utils";
+import { getBookmarkDetails, updateAsset } from "workerUtils";
 
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
-import { db, HoarderDBTransaction } from "@hoarder/db";
+import { db } from "@hoarder/db";
 import {
   assets,
   AssetTypes,
@@ -35,12 +36,12 @@ import {
 } from "@hoarder/db/schema";
 import {
   ASSET_TYPES,
-  deleteAsset,
   getAssetSize,
   IMAGE_ASSET_TYPES,
   newAssetId,
   saveAsset,
   saveAssetFromFile,
+  silentDeleteAsset,
   SUPPORTED_UPLOAD_ASSET_TYPES,
 } from "@hoarder/shared/assetdb";
 import serverConfig from "@hoarder/shared/config";
@@ -49,6 +50,7 @@ import {
   LinkCrawlerQueue,
   OpenAIQueue,
   triggerSearchReindex,
+  triggerVideoWorker,
   zCrawlLinkRequestSchema,
 } from "@hoarder/shared/queues";
 import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
@@ -207,33 +209,6 @@ async function changeBookmarkStatus(
     .where(eq(bookmarkLinks.id, bookmarkId));
 }
 
-async function getBookmarkDetails(bookmarkId: string) {
-  const bookmark = await db.query.bookmarks.findFirst({
-    where: eq(bookmarks.id, bookmarkId),
-    with: {
-      link: true,
-      assets: true,
-    },
-  });
-
-  if (!bookmark || !bookmark.link) {
-    throw new Error("The bookmark either doesn't exist or is not a link");
-  }
-  return {
-    url: bookmark.link.url,
-    userId: bookmark.userId,
-    screenshotAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
-    )?.id,
-    imageAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
-    )?.id,
-    fullPageArchiveAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
-    )?.id,
-  };
-}
-
 /**
  * This provides some "basic" protection from malicious URLs. However, all of those
  * can be easily circumvented by pointing dns of origin to localhost, or with
@@ -609,12 +584,8 @@ async function crawlAndParseUrl(
 
   // Delete the old assets if any
   await Promise.all([
-    oldScreenshotAssetId
-      ? deleteAsset({ userId, assetId: oldScreenshotAssetId }).catch(() => ({}))
-      : {},
-    oldImageAssetId
-      ? deleteAsset({ userId, assetId: oldImageAssetId }).catch(() => ({}))
-      : {},
+    silentDeleteAsset(userId, oldScreenshotAssetId),
+    silentDeleteAsset(userId, oldImageAssetId),
   ]);
 
   return async () => {
@@ -641,9 +612,7 @@ async function crawlAndParseUrl(
       );
     });
     if (oldFullPageArchiveAssetId) {
-      await deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
-        () => ({}),
-      );
+      silentDeleteAsset(userId, oldFullPageArchiveAssetId);
     }
   }
 };
@@ -713,17 +682,9 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
   // Update the search index
   await triggerSearchReindex(bookmarkId);
 
+  // Trigger a potential download of a video from the URL
+  await triggerVideoWorker(bookmarkId, url);
+
   // Do the archival as a separate last step as it has the potential for failure
   await archivalLogic();
 }
-
-async function updateAsset(
-  oldAssetId: string | undefined,
-  newAsset: DBAssetType,
-  txn: HoarderDBTransaction,
-) {
-  if (oldAssetId) {
-    await txn.delete(assets).where(eq(assets.id, oldAssetId));
-  }
-  await txn.insert(assets).values(newAsset);
-}
```
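The `silentDeleteAsset` helper that replaces the inline ternaries is imported from `@hoarder/shared/assetdb` and isn't shown in this diff. Judging purely from the call sites it replaces — delete only when an asset id is present, and swallow any failure — it plausibly looks like the following sketch (the real implementation may differ):

```typescript
// Inferred from the call sites above; the actual implementation in
// @hoarder/shared/assetdb may differ.
async function silentDeleteAsset(
  userId: string,
  assetId: string | undefined,
): Promise<void> {
  if (!assetId) {
    return; // Mirrors the old `oldId ? deleteAsset(...) : {}` ternary
  }
  // Swallow failures, as the old inline `.catch(() => ({}))` did
  await deleteAsset({ userId, assetId }).catch(() => ({}));
}
```

Note also that `getBookmarkDetails` and `updateAsset` weren't deleted outright: the new `import { getBookmarkDetails, updateAsset } from "workerUtils";` line shows they were moved into a shared `workerUtils` module, which accounts for most of the 49 removed lines.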
