Diffstat (limited to 'apps')
-rw-r--r--   apps/web/app/api/assets/[assetId]/route.ts                    |  32
-rw-r--r--   apps/web/components/dashboard/preview/AttachmentBox.tsx       |   2
-rw-r--r--   apps/web/components/dashboard/preview/LinkContentSection.tsx  |  19
-rw-r--r--   apps/workers/crawlerWorker.ts                                 |  59
-rw-r--r--   apps/workers/index.ts                                         |  15
-rw-r--r--   apps/workers/videoWorker.ts                                   | 202
-rw-r--r--   apps/workers/workerUtils.ts                                   |  48
7 files changed, 319 insertions, 58 deletions
diff --git a/apps/web/app/api/assets/[assetId]/route.ts b/apps/web/app/api/assets/[assetId]/route.ts
index 73237d8d..3bff79ba 100644
--- a/apps/web/app/api/assets/[assetId]/route.ts
+++ b/apps/web/app/api/assets/[assetId]/route.ts
@@ -27,10 +27,30 @@ export async function GET(
     assetId: params.assetId,
   });
 
-  return new Response(asset, {
-    status: 200,
-    headers: {
-      "Content-type": metadata.contentType,
-    },
-  });
+  const range = request.headers.get("Range");
+  if (range) {
+    const parts = range.replace(/bytes=/, "").split("-");
+    const start = parseInt(parts[0], 10);
+    const end = parts[1] ? parseInt(parts[1], 10) : asset.length - 1;
+
+    // TODO: Don't read the whole asset into memory in the first place
+    const chunk = asset.subarray(start, end + 1);
+    return new Response(chunk, {
+      status: 206, // Partial Content
+      headers: {
+        "Content-Range": `bytes ${start}-${end}/${asset.length}`,
+        "Accept-Ranges": "bytes",
+        "Content-Length": chunk.length.toString(),
+        "Content-type": metadata.contentType,
+      },
+    });
+  } else {
+    return new Response(asset, {
+      status: 200,
+      headers: {
+        "Content-Length": asset.length.toString(),
+        "Content-type": metadata.contentType,
+      },
+    });
+  }
 }
diff --git a/apps/web/components/dashboard/preview/AttachmentBox.tsx b/apps/web/components/dashboard/preview/AttachmentBox.tsx
index 436f1026..d631f4d9 100644
--- a/apps/web/components/dashboard/preview/AttachmentBox.tsx
+++ b/apps/web/components/dashboard/preview/AttachmentBox.tsx
@@ -20,6 +20,7 @@ import {
   Pencil,
   Plus,
   Trash2,
+  Video,
 } from "lucide-react";
 
 import {
@@ -44,6 +45,7 @@ export default function AttachmentBox({ bookmark }: { bookmark: ZBookmark }) {
     screenshot: <Camera className="size-4" />,
     fullPageArchive: <Archive className="size-4" />,
     bannerImage: <Image className="size-4" />,
+    video: <Video className="size-4" />,
     bookmarkAsset: <Paperclip className="size-4" />,
     unknown: <Paperclip className="size-4" />,
   };
diff --git a/apps/web/components/dashboard/preview/LinkContentSection.tsx b/apps/web/components/dashboard/preview/LinkContentSection.tsx
index f2069821..bf0d8f90 100644
--- a/apps/web/components/dashboard/preview/LinkContentSection.tsx
+++ b/apps/web/components/dashboard/preview/LinkContentSection.tsx
@@ -60,6 +60,20 @@ function CachedContentSection({ link }: { link: ZBookmarkedLink }) {
   return <ScrollArea className="h-full">{content}</ScrollArea>;
 }
 
+function VideoSection({ link }: { link: ZBookmarkedLink }) {
+  return (
+    <div className="relative h-full w-full overflow-hidden">
+      <div className="absolute inset-0 h-full w-full">
+        {/* eslint-disable-next-line jsx-a11y/media-has-caption -- captions not (yet) available */}
+        <video className="m-auto max-h-full max-w-full" controls>
+          <source src={`/api/assets/${link.videoAssetId}`} />
+          Not supported by your browser
+        </video>
+      </div>
+    </div>
+  );
+}
+
 export default function LinkContentSection({
   bookmark,
 }: {
@@ -76,6 +90,8 @@ export default function LinkContentSection({
     content = <CachedContentSection link={bookmark.content} />;
   } else if (section === "archive") {
     content = <FullPageArchiveSection link={bookmark.content} />;
+  } else if (section === "video") {
+    content = <VideoSection link={bookmark.content} />;
   } else {
     content = <ScreenshotSection link={bookmark.content} />;
   }
@@ -101,6 +117,9 @@ export default function LinkContentSection({
             >
               Archive
             </SelectItem>
+            <SelectItem value="video" disabled={!bookmark.content.videoAssetId}>
+              Video
+            </SelectItem>
           </SelectGroup>
         </SelectContent>
       </Select>
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index ca0f6608..d5bc555e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -23,9 +23,10 @@ import puppeteer from "puppeteer-extra";
 import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withTimeout } from "utils";
+import { getBookmarkDetails, updateAsset } from "workerUtils";
 
 import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
-import { db, HoarderDBTransaction } from "@hoarder/db";
+import { db } from "@hoarder/db";
 import {
   assets,
   AssetTypes,
@@ -35,12 +36,12 @@ import {
 } from "@hoarder/db/schema";
 import {
   ASSET_TYPES,
-  deleteAsset,
   getAssetSize,
   IMAGE_ASSET_TYPES,
   newAssetId,
   saveAsset,
   saveAssetFromFile,
+  silentDeleteAsset,
   SUPPORTED_UPLOAD_ASSET_TYPES,
 } from "@hoarder/shared/assetdb";
 import serverConfig from "@hoarder/shared/config";
@@ -49,6 +50,7 @@ import {
   LinkCrawlerQueue,
   OpenAIQueue,
   triggerSearchReindex,
+  triggerVideoWorker,
   zCrawlLinkRequestSchema,
 } from "@hoarder/shared/queues";
 import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
@@ -207,33 +209,6 @@ async function changeBookmarkStatus(
     .where(eq(bookmarkLinks.id, bookmarkId));
 }
 
-async function getBookmarkDetails(bookmarkId: string) {
-  const bookmark = await db.query.bookmarks.findFirst({
-    where: eq(bookmarks.id, bookmarkId),
-    with: {
-      link: true,
-      assets: true,
-    },
-  });
-
-  if (!bookmark || !bookmark.link) {
-    throw new Error("The bookmark either doesn't exist or is not a link");
-  }
-  return {
-    url: bookmark.link.url,
-    userId: bookmark.userId,
-    screenshotAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
-    )?.id,
-    imageAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
-    )?.id,
-    fullPageArchiveAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
-    )?.id,
-  };
-}
-
 /**
  * This provides some "basic" protection from malicious URLs. However, all of those
  * can be easily circumvented by pointing dns of origin to localhost, or with
@@ -609,12 +584,8 @@ async function crawlAndParseUrl(
 
   // Delete the old assets if any
   await Promise.all([
-    oldScreenshotAssetId
-      ? deleteAsset({ userId, assetId: oldScreenshotAssetId }).catch(() => ({}))
-      : {},
-    oldImageAssetId
-      ? deleteAsset({ userId, assetId: oldImageAssetId }).catch(() => ({}))
-      : {},
+    silentDeleteAsset(userId, oldScreenshotAssetId),
+    silentDeleteAsset(userId, oldImageAssetId),
   ]);
 
   return async () => {
@@ -641,9 +612,7 @@ async function crawlAndParseUrl(
         );
       });
       if (oldFullPageArchiveAssetId) {
-        await deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
-          () => ({}),
-        );
+        silentDeleteAsset(userId, oldFullPageArchiveAssetId);
       }
     }
   };
@@ -713,17 +682,9 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
   // Update the search index
   await triggerSearchReindex(bookmarkId);
 
+  // Trigger a potential download of a video from the URL
+  await triggerVideoWorker(bookmarkId, url);
+
   // Do the archival as a separate last step as it has the potential for failure
   await archivalLogic();
 }
-
-async function updateAsset(
-  oldAssetId: string | undefined,
-  newAsset: DBAssetType,
-  txn: HoarderDBTransaction,
-) {
-  if (oldAssetId) {
-    await txn.delete(assets).where(eq(assets.id, oldAssetId));
-  }
-  await txn.insert(assets).values(newAsset);
-}
diff --git a/apps/workers/index.ts b/apps/workers/index.ts
index f9a05e59..3b5896e4 100644
--- a/apps/workers/index.ts
+++ b/apps/workers/index.ts
@@ -10,30 +10,39 @@ import { CrawlerWorker } from "./crawlerWorker";
 import { shutdownPromise } from "./exit";
 import { OpenAiWorker } from "./openaiWorker";
 import { SearchIndexingWorker } from "./searchWorker";
+import { VideoWorker } from "./videoWorker";
 
 async function main() {
   logger.info(`Workers version: ${serverConfig.serverVersion ?? "not set"}`);
   runQueueDBMigrations();
 
-  const [crawler, openai, search, tidyAssets] = [
+  const [crawler, openai, search, tidyAssets, video] = [
     await CrawlerWorker.build(),
     OpenAiWorker.build(),
     SearchIndexingWorker.build(),
     TidyAssetsWorker.build(),
+    VideoWorker.build(),
   ];
 
   await Promise.any([
-    Promise.all([crawler.run(), openai.run(), search.run(), tidyAssets.run()]),
+    Promise.all([
+      crawler.run(),
+      openai.run(),
+      search.run(),
+      tidyAssets.run(),
+      video.run(),
+    ]),
     shutdownPromise,
   ]);
 
   logger.info(
-    "Shutting down crawler, openai, tidyAssets and search workers ...",
+    "Shutting down crawler, openai, tidyAssets, video and search workers ...",
   );
   crawler.stop();
   openai.stop();
   search.stop();
   tidyAssets.stop();
+  video.stop();
 }
 
 main();
diff --git a/apps/workers/videoWorker.ts b/apps/workers/videoWorker.ts
new file mode 100644
index 00000000..5448f0fa
--- /dev/null
+++ b/apps/workers/videoWorker.ts
@@ -0,0 +1,202 @@
+import fs from "fs";
+import * as os from "os";
+import path from "path";
+import { execa } from "execa";
+import { DequeuedJob, Runner } from "liteque";
+
+import { db } from "@hoarder/db";
+import { AssetTypes } from "@hoarder/db/schema";
+import {
+  ASSET_TYPES,
+  getAssetSize,
+  newAssetId,
+  saveAssetFromFile,
+} from "@hoarder/shared/assetdb";
+import serverConfig from "@hoarder/shared/config";
+import logger from "@hoarder/shared/logger";
+import { VideoWorkerQueue, ZVideoRequest } from "@hoarder/shared/queues";
+
+import { withTimeout } from "./utils";
+import { getBookmarkDetails, updateAsset } from "./workerUtils";
+
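+// Scratch directory where yt-dlp writes downloads before they are moved into the asset store.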
+const TMP_FOLDER = path.join(os.tmpdir(), "video_downloads");
+
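+/**
+ * Worker that downloads the video behind a link bookmark with yt-dlp and
+ * stores it as a LINK_VIDEO asset. Jobs come from VideoWorkerQueue; each run
+ * is capped by `crawler.downloadVideoTimeout` and concurrency is limited to 1.
+ */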
+export class VideoWorker {
+  static build() {
+    logger.info("Starting video worker ...");
+
+    return new Runner<ZVideoRequest>(
+      VideoWorkerQueue,
+      {
+        run: withTimeout(
+          runWorker,
+          /* timeoutSec */ serverConfig.crawler.downloadVideoTimeout,
+        ),
+        onComplete: async (job) => {
+          const jobId = job?.id ?? "unknown";
+          logger.info(
+            `[VideoCrawler][${jobId}] Video Download Completed successfully`,
+          );
+          return Promise.resolve();
+        },
+        onError: async (job) => {
+          const jobId = job?.id ?? "unknown";
+          logger.error(
+            `[VideoCrawler][${jobId}] Video Download job failed: ${job.error}`,
+          );
+          return Promise.resolve();
+        },
+      },
+      {
+        pollIntervalMs: 1000,
+        timeoutSecs: serverConfig.crawler.downloadVideoTimeout,
+        concurrency: 1,
+      },
+    );
+  }
+}
+
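+/**
+ * Builds the yt-dlp argument list: the URL, an optional `-f best[filesize<...M]`
+ * cap when maxVideoDownloadSize is configured, the output path (-o) and
+ * --no-playlist so only the single video is downloaded.
+ */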
+function prepareYtDlpArguments(url: string, assetPath: string) {
+  // TODO allow custom commandline arguments?
+  const ytDlpArguments = [url];
+  if (serverConfig.crawler.maxVideoDownloadSize > 0) {
+    ytDlpArguments.push(
+      "-f",
+      `best[filesize<${serverConfig.crawler.maxVideoDownloadSize}M]`,
+    );
+  }
+  ytDlpArguments.push("-o", assetPath);
+  ytDlpArguments.push("--no-playlist");
+  return ytDlpArguments;
+}
+
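+/**
+ * Downloads the video for a bookmark into TMP_FOLDER, uploads it with
+ * saveAssetFromFile and swaps the bookmark's LINK_VIDEO asset inside a
+ * transaction. Disabled config and URLs unsupported by yt-dlp are no-ops.
+ */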
+async function runWorker(job: DequeuedJob<ZVideoRequest>) {
+  const jobId = job.id ?? "unknown";
+  const { bookmarkId } = job.data;
+
+  const {
+    url,
+    userId,
+    videoAssetId: oldVideoAssetId,
+  } = await getBookmarkDetails(bookmarkId);
+
+  if (!serverConfig.crawler.downloadVideo) {
+    logger.info(
+      `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it is disabled in the config.`,
+    );
+    return;
+  }
+
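+  // yt-dlp appends its own file extension to the output path, so the actual
+  // file is located again via findAssetFile() once the download finishes.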
+  const videoAssetId = newAssetId();
+  let assetPath = `${TMP_FOLDER}/${videoAssetId}`;
+  await fs.promises.mkdir(TMP_FOLDER, { recursive: true });
+
+  const ytDlpArguments = prepareYtDlpArguments(url, assetPath);
+
+  try {
+    logger.info(
+      `[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
+    );
+
+    await execa`yt-dlp ${ytDlpArguments}`;
+    const downloadPath = await findAssetFile(videoAssetId);
+    if (!downloadPath) {
+      logger.info(
+ "[VideoCrawler][${jobId}] yt-dlp didn't download anything. Skipping ...",
+      );
+      return;
+    }
+    assetPath = downloadPath;
+  } catch (e) {
+    const err = e as Error;
+    if (err.message.includes("ERROR: Unsupported URL:")) {
+      logger.info(
+        `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it's not one of the supported yt-dlp URLs`,
+      );
+      return;
+    }
+    logger.error(
+      `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}": ${err.message}`,
+    );
+    await deleteLeftOverAssetFile(jobId, videoAssetId);
+    return;
+  }
+
+  logger.info(
+    `[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`,
+  );
+  await saveAssetFromFile({
+    userId,
+    assetId: videoAssetId,
+    assetPath,
+    metadata: { contentType: ASSET_TYPES.VIDEO_MP4 },
+  });
+
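+  // Replace any previous LINK_VIDEO asset row with the new one in a single
+  // transaction (updateAsset deletes the old row before inserting).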
+  await db.transaction(async (txn) => {
+    await updateAsset(
+      oldVideoAssetId,
+      {
+        id: videoAssetId,
+        bookmarkId,
+        userId,
+        assetType: AssetTypes.LINK_VIDEO,
+        contentType: ASSET_TYPES.VIDEO_MP4,
+        size: await getAssetSize({ userId, assetId: videoAssetId }),
+      },
+      txn,
+    );
+  });
+
+  logger.info(
+    `[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`,
+  );
+}
+
+/**
+ * Deletes leftover assets in case the download fails
+ *
+ * @param jobId the id of the job
+ * @param assetId the id of the asset to delete
+ */
+async function deleteLeftOverAssetFile(
+  jobId: string,
+  assetId: string,
+): Promise<void> {
+  let assetFile;
+  try {
+    assetFile = await findAssetFile(assetId);
+  } catch {
+    // ignore exception, no asset file was found
+    return;
+  }
+  if (!assetFile) {
+    return;
+  }
+  logger.info(
+    `[VideoCrawler][${jobId}] Deleting leftover video asset "${assetFile}".`,
+  );
+  try {
+    await fs.promises.rm(assetFile);
+  } catch (e) {
+    logger.error(
+      `[VideoCrawler][${jobId}] Failed deleting leftover video asset "${assetFile}".`,
+    );
+  }
+}
+
+/**
+ * yt-dlp automatically appends a file extension to the passed-in filename, so the file has to be looked up in the folder again
+ *
+ * @param assetId the id of the asset to search
+ * @returns the path to the downloaded asset
+ */
+async function findAssetFile(assetId: string): Promise<string | null> {
+  const files = await fs.promises.readdir(TMP_FOLDER);
+  for (const file of files) {
+    if (file.startsWith(assetId)) {
+      return path.join(TMP_FOLDER, file);
+    }
+  }
+  return null;
+}
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
new file mode 100644
index 00000000..e93d241b
--- /dev/null
+++ b/apps/workers/workerUtils.ts
@@ -0,0 +1,48 @@
+import { eq } from "drizzle-orm";
+
+import { db, HoarderDBTransaction } from "@hoarder/db";
+import { assets, AssetTypes, bookmarks } from "@hoarder/db/schema";
+
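+// Insert shape of the assets table, inferred from the drizzle schema.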
+type DBAssetType = typeof assets.$inferInsert;
+
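+/**
+ * Replaces a bookmark asset inside an existing transaction: deletes the old
+ * asset row (if any) and inserts the new one.
+ */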
+export async function updateAsset(
+  oldAssetId: string | undefined,
+  newAsset: DBAssetType,
+  txn: HoarderDBTransaction,
+) {
+  if (oldAssetId) {
+    await txn.delete(assets).where(eq(assets.id, oldAssetId));
+  }
+
+  await txn.insert(assets).values(newAsset);
+}
+
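+/**
+ * Loads a link bookmark together with its assets and returns its url, owner
+ * and the ids of the screenshot, banner image, full page archive and video
+ * assets (if present). Throws if the bookmark doesn't exist or isn't a link.
+ */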
+export async function getBookmarkDetails(bookmarkId: string) {
+  const bookmark = await db.query.bookmarks.findFirst({
+    where: eq(bookmarks.id, bookmarkId),
+    with: {
+      link: true,
+      assets: true,
+    },
+  });
+
+  if (!bookmark || !bookmark.link) {
+    throw new Error("The bookmark either doesn't exist or is not a link");
+  }
+  return {
+    url: bookmark.link.url,
+    userId: bookmark.userId,
+    screenshotAssetId: bookmark.assets.find(
+      (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
+    )?.id,
+    imageAssetId: bookmark.assets.find(
+      (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
+    )?.id,
+    fullPageArchiveAssetId: bookmark.assets.find(
+      (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+    )?.id,
+    videoAssetId: bookmark.assets.find(
+      (a) => a.assetType == AssetTypes.LINK_VIDEO,
+    )?.id,
+  };
+}
