/*
 * Provenance: reconstructed from the patch that introduced this file.
 *
 * Commit:  4a13c36da50f6b3171d817edebefe96ba85dc666
 * Author:  kamtschatka
 * Date:    Mon, 28 Oct 2024 02:51:00 +0100
 * Subject: feature: Archive videos using yt-dlp. Fixes #215 (#525)
 * File:    apps/workers/videoWorker.ts (new file)
 *
 * Commit message:
 *   * Allow downloading more content from a webpage and index it #215
 *     Added a worker that allows downloading videos depending on the
 *     environment variables
 *     refactored the code a bit
 *     added new video asset
 *     updated documentation
 *   * Some tweaks
 *   * Drop the dependency on the yt-dlp wrapper
 *   * Update openapi specs
 *   * Dont log an error when the url is not supported
 *   * Better handle supported websites that dont download anything
 *
 *   Co-authored-by: Mohamed Bassem
 */
import fs from "fs";
import * as os from "os";
import path from "path";
import { execa } from "execa";
import { DequeuedJob, Runner } from "liteque";

import { db } from "@hoarder/db";
import { AssetTypes } from "@hoarder/db/schema";
import {
  ASSET_TYPES,
  getAssetSize,
  newAssetId,
  saveAssetFromFile,
} from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
import { VideoWorkerQueue, ZVideoRequest } from "@hoarder/shared/queues";

import { withTimeout } from "./utils";
import { getBookmarkDetails, updateAsset } from "./workerUtils";

/** Scratch directory that yt-dlp downloads into before the asset is stored. */
const TMP_FOLDER = path.join(os.tmpdir(), "video_downloads");

/**
 * Factory for the queue runner that archives the video behind a bookmark.
 */
export class VideoWorker {
  /**
   * Builds a runner bound to {@link VideoWorkerQueue}.
   *
   * The same `downloadVideoTimeout` value is applied twice: once by wrapping
   * `runWorker` with `withTimeout`, and once as the runner's own `timeoutSecs`.
   * Jobs are processed one at a time (`concurrency: 1`).
   */
  static build() {
    logger.info("Starting video worker ...");

    return new Runner<ZVideoRequest>(
      VideoWorkerQueue,
      {
        run: withTimeout(
          runWorker,
          /* timeoutSec */ serverConfig.crawler.downloadVideoTimeout,
        ),
        onComplete: async (job) => {
          const jobId = job?.id ?? "unknown";
          logger.info(
            `[VideoCrawler][${jobId}] Video Download Completed successfully`,
          );
        },
        onError: async (job) => {
          const jobId = job?.id ?? "unknown";
          logger.error(
            `[VideoCrawler][${jobId}] Video Download job failed: ${job.error}`,
          );
        },
      },
      {
        pollIntervalMs: 1000,
        timeoutSecs: serverConfig.crawler.downloadVideoTimeout,
        concurrency: 1,
      },
    );
  }
}

/**
 * Assembles the command-line arguments passed to yt-dlp.
 *
 * @param url the URL to download from
 * @param assetPath output path handed to `-o`
 * @returns the argument list: the URL, an optional `-f` size filter (only when
 *   `maxVideoDownloadSize` is greater than zero), the output path and
 *   `--no-playlist`
 */
function prepareYtDlpArguments(url: string, assetPath: string): string[] {
  // TODO allow custom commandline arguments?
  const ytDlpArguments = [url];
  if (serverConfig.crawler.maxVideoDownloadSize > 0) {
    ytDlpArguments.push(
      "-f",
      `best[filesize<${serverConfig.crawler.maxVideoDownloadSize}M]`,
    );
  }
  ytDlpArguments.push("-o", assetPath);
  ytDlpArguments.push("--no-playlist");
  return ytDlpArguments;
}

/**
 * Downloads the video for a bookmark with yt-dlp and registers it as the
 * bookmark's `LINK_VIDEO` asset.
 *
 * The job ends quietly (without throwing) when video download is disabled in
 * the config, when yt-dlp reports the URL as unsupported, when yt-dlp exits
 * successfully but produces no file, or when the download fails. In the
 * failure case any leftover file for this asset id is removed first.
 *
 * @param job the dequeued job; `job.data.bookmarkId` selects the bookmark
 */
async function runWorker(job: DequeuedJob<ZVideoRequest>): Promise<void> {
  const jobId = job.id ?? "unknown";
  const { bookmarkId } = job.data;

  const {
    url,
    userId,
    videoAssetId: oldVideoAssetId,
  } = await getBookmarkDetails(bookmarkId);

  if (!serverConfig.crawler.downloadVideo) {
    logger.info(
      `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it is disabled in the config.`,
    );
    return;
  }

  const videoAssetId = newAssetId();
  let assetPath = path.join(TMP_FOLDER, videoAssetId);
  await fs.promises.mkdir(TMP_FOLDER, { recursive: true });

  const ytDlpArguments = prepareYtDlpArguments(url, assetPath);

  try {
    logger.info(
      `[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
    );

    await execa`yt-dlp ${ytDlpArguments}`;
    // yt-dlp appends a file extension to the requested name, so the real
    // path has to be looked up in the download folder afterwards.
    const downloadPath = await findAssetFile(videoAssetId);
    if (!downloadPath) {
      logger.info(
        `[VideoCrawler][${jobId}] yt-dlp didn't download anything. Skipping ...`,
      );
      return;
    }
    assetPath = downloadPath;
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    if (message.includes("ERROR: Unsupported URL:")) {
      logger.info(
        `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it's not one of the supported yt-dlp URLs`,
      );
      return;
    }
    logger.error(
      `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}": ${message}`,
    );
    await deleteLeftOverAssetFile(jobId, videoAssetId);
    return;
  }

  logger.info(
    `[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`,
  );
  await saveAssetFromFile({
    userId,
    assetId: videoAssetId,
    assetPath,
    metadata: { contentType: ASSET_TYPES.VIDEO_MP4 },
  });

  await db.transaction(async (txn) => {
    await updateAsset(
      oldVideoAssetId,
      {
        id: videoAssetId,
        bookmarkId,
        userId,
        assetType: AssetTypes.LINK_VIDEO,
        contentType: ASSET_TYPES.VIDEO_MP4,
        size: await getAssetSize({ userId, assetId: videoAssetId }),
      },
      txn,
    );
  });

  logger.info(
    `[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`,
  );
}

/**
 * Deletes leftover assets in case the download fails.
 *
 * Best effort: a failed lookup is ignored and a failed delete is only logged,
 * so this function never throws.
 *
 * @param jobId the id of the job
 * @param assetId the id of the asset to delete
 */
async function deleteLeftOverAssetFile(
  jobId: string,
  assetId: string,
): Promise<void> {
  let assetFile: string | null;
  try {
    assetFile = await findAssetFile(assetId);
  } catch {
    // ignore exception, no asset file was found
    return;
  }
  if (!assetFile) {
    return;
  }
  logger.info(
    `[VideoCrawler][${jobId}] Deleting leftover video asset "${assetFile}".`,
  );
  try {
    await fs.promises.rm(assetFile);
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    logger.error(
      `[VideoCrawler][${jobId}] Failed deleting leftover video asset "${assetFile}": ${reason}`,
    );
  }
}

/**
 * yt-dlp automatically adds a file ending to the passed in filename, so the
 * downloaded file has to be searched for in the download folder.
 *
 * @param assetId the id of the asset to search
 * @returns the path of the first entry in the download folder whose name
 *   starts with `assetId`, or `null` when there is none
 */
async function findAssetFile(assetId: string): Promise<string | null> {
  const files = await fs.promises.readdir(TMP_FOLDER);
  const match = files.find((file) => file.startsWith(assetId));
  return match === undefined ? null : path.join(TMP_FOLDER, match);
}