feat: use reddit API for metadata extraction. Fixes #1853 #1883

author: Mohamed Bassem <me@mbassem.com> 2025-12-13 22:09:57 +0000
committer: Mohamed Bassem <me@mbassem.com> 2025-12-13 23:34:19 +0000
commit: f5c32d940ea6c2a6da6c225c2de1eba13b49f9e0 (patch)
tree: 81d8458e6a92fe1f93226583ab30f1d3495c276c /apps
parent: d6dd8ebdb614d39890810ff9fbc8d71d35af4f03 (diff)
download: karakeep-f5c32d940ea6c2a6da6c225c2de1eba13b49f9e0.tar.zst
3 files changed, 343 insertions, 33 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts
index 1fbee3ea..a5de5fe3 100644
--- a/apps/workers/metascraper-plugins/metascraper-reddit.ts
+++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts
@@ -1,4 +1,8 @@
-import type { Rules } from "metascraper";
+import type { CheerioAPI } from "cheerio";
+import type { Rules, RulesOptions } from "metascraper";
+import { decode as decodeHtmlEntities } from "html-entities";
+import { fetchWithProxy } from "network";
+import { z } from "zod";
 
 import logger from "@karakeep/shared/logger";
 
@@ -28,15 +32,267 @@ import logger from "@karakeep/shared/logger";
  * will return 'undefined' and the next plugin
  * should continue to attempt to extract images.
  *
- * Note: there is another way to accomplish this.
- * If '.json' is appended to a Reddit url, the
- * server will provide a JSON document summarizing
- * the post. If there are preview images, they are
- * included in a section of the JSON. To prevent
- * additional server requests, this method is not
- * currently being used.
+ * We also attempt to fetch the Reddit JSON response
+ * (by appending '.json' to the URL) to grab the
+ * title and preview images directly from the API.
  **/
 
+const redditPreviewImageSchema = z.object({
+  source: z.object({ url: z.string().optional() }).optional(),
+  resolutions: z.array(z.object({ url: z.string().optional() })).optional(),
+});
+
+const redditMediaMetadataItemSchema = z.object({
+  s: z.object({ u: z.string().optional() }).optional(),
+  p: z.array(z.object({ u: z.string().optional() })).optional(),
+});
+
+const redditPostSchema = z.object({
+  title: z.string().optional(),
+  preview: z
+    .object({ images: z.array(redditPreviewImageSchema).optional() })
+    .optional(),
+  url_overridden_by_dest: z.string().optional(),
+  url: z.string().optional(),
+  thumbnail: z.string().optional(),
+  media_metadata: z.record(redditMediaMetadataItemSchema).optional(),
+  author: z.string().optional(),
+  created_utc: z.number().optional(),
+  selftext: z.string().nullish(),
+  selftext_html: z.string().nullish(),
+  subreddit_name_prefixed: z.string().optional(),
+});
+
+type RedditPostData = z.infer<typeof redditPostSchema>;
+
+const redditResponseSchema = z.array(
+  z.object({
+    data: z.object({
+      children: z.array(z.object({ data: redditPostSchema })).optional(),
+    }),
+  }),
+);
+
+interface RedditFetchResult {
+  fetched: boolean;
+  post?: RedditPostData;
+}
+
+const REDDIT_CACHE_TTL_MS = 60 * 1000; // 1 minute TTL to avoid stale data
+
+interface RedditCacheEntry {
+  expiresAt: number;
+  promise: Promise<RedditFetchResult>;
+}
+
+const redditJsonCache = new Map<string, RedditCacheEntry>();
+
+const purgeExpiredCacheEntries = (now: number) => {
+  for (const [key, entry] of redditJsonCache.entries()) {
+    if (entry.expiresAt <= now) {
+      redditJsonCache.delete(key);
+    }
+  }
+};
+
+const decodeRedditUrl = (url?: string): string | undefined => {
+  if (!url) {
+    return undefined;
+  }
+  const decoded = decodeHtmlEntities(url);
+  return decoded || undefined;
+};
+
+const buildJsonUrl = (url: string): string => {
+  const urlObj = new URL(url);
+
+  if (!urlObj.pathname.endsWith(".json")) {
+    urlObj.pathname = urlObj.pathname.replace(/\/?$/, ".json");
+  }
+
+  return urlObj.toString();
+};
+
+const extractImageFromMediaMetadata = (
+  media_metadata?: RedditPostData["media_metadata"],
+): string | undefined => {
+  if (!media_metadata) {
+    return undefined;
+  }
+  const firstItem = Object.values(media_metadata)[0];
+  if (!firstItem) {
+    return undefined;
+  }
+
+  return (
+    decodeRedditUrl(firstItem.s?.u) ??
+    decodeRedditUrl(firstItem.p?.[0]?.u) ??
+    undefined
+  );
+};
+
+const isRedditImageHost = (urlCandidate: string): boolean => {
+  try {
+    const hostname = new URL(urlCandidate).hostname;
+    return hostname.includes("redd.it");
+  } catch {
+    return false;
+  }
+};
+
+const extractImageFromPost = (post: RedditPostData): string | undefined => {
+  const previewImage = post.preview?.images?.[0];
+  const previewUrl =
+    decodeRedditUrl(previewImage?.source?.url) ??
+    decodeRedditUrl(previewImage?.resolutions?.[0]?.url);
+  if (previewUrl) {
+    return previewUrl;
+  }
+
+  const mediaUrl = extractImageFromMediaMetadata(post.media_metadata);
+  if (mediaUrl) {
+    return mediaUrl;
+  }
+
+  const directUrl =
+    decodeRedditUrl(post.url_overridden_by_dest) ??
+    decodeRedditUrl(post.url) ??
+    decodeRedditUrl(post.thumbnail);
+
+  if (directUrl && isRedditImageHost(directUrl)) {
+    return directUrl;
+  }
+
+  return undefined;
+};
+
+const extractTitleFromPost = (post: RedditPostData): string | undefined =>
+  post.title?.trim() || undefined;
+
+const extractAuthorFromPost = (post: RedditPostData): string | undefined =>
+  post.author?.trim() || undefined;
+
+const extractDateFromPost = (post: RedditPostData): string | undefined => {
+  if (!post.created_utc) {
+    return undefined;
+  }
+  const date = new Date(post.created_utc * 1000);
+  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
+};
+
+const extractPublisherFromPost = (post: RedditPostData): string | undefined =>
+  post.subreddit_name_prefixed?.trim() || "Reddit";
+
+const REDDIT_LOGO_URL =
+  "https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png";
+
+const fallbackDomImage = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
+  // 'preview' subdomain images are more likely to be what we're after
+  // but it could be in the 'i' subdomain.
+  // returns undefined if neither exists
+  const previewImages = htmlDom('img[src*="preview.redd.it"]')
+    .map((_, el) => htmlDom(el).attr("src"))
+    .get();
+  const iImages = htmlDom('img[src*="i.redd.it"]')
+    .map((_, el) => htmlDom(el).attr("src"))
+    .get();
+  return previewImages[0] || iImages[0];
+};
+
+const fallbackDomTitle = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
+  const title: string | undefined = htmlDom("shreddit-title[title]")
+    .first()
+    .attr("title");
+  const postTitle: string | undefined =
+    title ?? htmlDom("shreddit-post[post-title]").first().attr("post-title");
+  return postTitle ? postTitle.trim() : undefined;
+};
+
+const fetchRedditPostData = async (url: string): Promise<RedditFetchResult> => {
+  const cached = redditJsonCache.get(url);
+  const now = Date.now();
+
+  purgeExpiredCacheEntries(now);
+
+  if (cached && cached.expiresAt > now) {
+    return cached.promise;
+  }
+
+  const promise = (async () => {
+    let jsonUrl: string;
+    try {
+      jsonUrl = buildJsonUrl(url);
+    } catch (error) {
+      logger.warn(
+        "[MetascraperReddit] Failed to construct Reddit JSON URL",
+        error,
+      );
+      return { fetched: false };
+    }
+
+    let response;
+    try {
+      response = await fetchWithProxy(jsonUrl, {
+        headers: { accept: "application/json" },
+      });
+    } catch (error) {
+      logger.warn(
+        `[MetascraperReddit] Failed to fetch Reddit JSON for ${jsonUrl}`,
+        error,
+      );
+      return { fetched: false };
+    }
+
+    if (response.status === 403) {
+      // API forbidden; fall back to DOM scraping.
+      return { fetched: false };
+    }
+
+    if (!response.ok) {
+      logger.warn(
+        `[MetascraperReddit] Reddit JSON request failed for ${jsonUrl} with status ${response.status}`,
+      );
+      return { fetched: false };
+    }
+
+    let payload: unknown;
+    try {
+      payload = await response.json();
+    } catch (error) {
+      logger.warn(
+        `[MetascraperReddit] Failed to parse Reddit JSON for ${jsonUrl}`,
+        error,
+      );
+      return { fetched: false };
+    }
+
+    const parsed = redditResponseSchema.safeParse(payload);
+    if (!parsed.success) {
+      logger.warn(
+        "[MetascraperReddit] Reddit JSON schema validation failed",
+        parsed.error,
+      );
+      return { fetched: false };
+    }
+
+    const firstListingWithChildren = parsed.data.find(
+      (listing) => (listing.data.children?.length ?? 0) > 0,
+    );
+
+    return {
+      fetched: true,
+      post: firstListingWithChildren?.data.children?.[0]?.data,
+    };
+  })();
+
+  redditJsonCache.set(url, {
+    promise,
+    expiresAt: now + REDDIT_CACHE_TTL_MS,
+  });
+
+  return promise;
+};
+
 const domainFromUrl = (url: string): string => {
   /**
    * First-party metascraper plugins import metascraper-helpers,
@@ -71,27 +327,71 @@ const metascraperReddit = () => {
   const rules: Rules = {
     pkgName: "metascraper-reddit",
     test,
-    image: ({ htmlDom }) => {
-      // 'preview' subdomain images are more likely to be what we're after
-      // but it could be in the 'i' subdomain.
-      // returns undefined if neither exists
-      const previewImages = htmlDom('img[src*="preview.redd.it"]')
-        .map((i, el) => htmlDom(el).attr("src"))
-        .get();
-      const iImages = htmlDom('img[src*="i.redd.it"]')
-        .map((i, el) => htmlDom(el).attr("src"))
-        .get();
-      return previewImages[0] || iImages[0];
-    },
-    title: ({ htmlDom }) => {
-      const title: string | undefined = htmlDom("shreddit-title[title]")
-        .first()
-        .attr("title");
-      const postTitle: string | undefined =
-        title ??
-        htmlDom("shreddit-post[post-title]").first().attr("post-title");
-      return postTitle ? postTitle.trim() : undefined;
-    },
+    image: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
+      const result = await fetchRedditPostData(url);
+      if (result.post) {
+        const redditImage = extractImageFromPost(result.post);
+        if (redditImage) {
+          return redditImage;
+        }
+      }
+
+      // If we successfully fetched JSON but found no Reddit image,
+      // avoid falling back to random DOM images.
+      if (result.fetched) {
+        return undefined;
+      }
+
+      return fallbackDomImage({ htmlDom });
+    }) as unknown as RulesOptions,
+    title: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
+      const result = await fetchRedditPostData(url);
+      if (result.post) {
+        const redditTitle = extractTitleFromPost(result.post);
+        if (redditTitle) {
+          return redditTitle;
+        }
+      }
+
+      return fallbackDomTitle({ htmlDom });
+    }) as unknown as RulesOptions,
+    author: (async ({ url }: { url: string }) => {
+      const result = await fetchRedditPostData(url);
+      if (result.post) {
+        return extractAuthorFromPost(result.post);
+      }
+      return undefined;
+    }) as unknown as RulesOptions,
+    datePublished: (async ({ url }: { url: string }) => {
+      const result = await fetchRedditPostData(url);
+      if (result.post) {
+        return extractDateFromPost(result.post);
+      }
+      return undefined;
+    }) as unknown as RulesOptions,
+    publisher: (async ({ url }: { url: string }) => {
+      const result = await fetchRedditPostData(url);
+      if (result.post) {
+        return extractPublisherFromPost(result.post);
+      }
+      return undefined;
+    }) as unknown as RulesOptions,
+    logo: (async ({ url }: { url: string }) => {
+      const result = await fetchRedditPostData(url);
+      if (result.post) {
+        return REDDIT_LOGO_URL;
+      }
+      return undefined;
+    }) as unknown as RulesOptions,
+    readableContentHtml: (async ({ url }: { url: string }) => {
+      const result = await fetchRedditPostData(url);
+      if (result.post) {
+        const decoded = decodeHtmlEntities(result.post.selftext_html ?? "");
+        // The post has no content, return the title
+        return (decoded || result.post.title) ?? null;
+      }
+      return undefined;
+    }) as unknown as RulesOptions,
   };
 
   return rules;
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 7a5a1c81..df6c8618 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -22,6 +22,7 @@
     "drizzle-orm": "^0.44.2",
     "execa": "9.3.1",
     "hono": "^4.10.6",
+    "html-entities": "^2.6.0",
     "http-proxy-agent": "^7.0.2",
     "https-proxy-agent": "^7.0.6",
     "ipaddr.js": "^2.2.0",
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index a2495423..2c96338a 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1128,10 +1128,19 @@ async function crawlAndParseUrl(
   ]);
   abortSignal.throwIfAborted();
 
-  let readableContent = await Promise.race([
-    extractReadableContent(htmlContent, browserUrl, jobId),
-    abortPromise(abortSignal),
-  ]);
+  let readableContent: { content: string } | null = meta.readableContentHtml
+    ? { content: meta.readableContentHtml }
+    : null;
+  if (!readableContent) {
+    readableContent = await Promise.race([
+      extractReadableContent(
+        meta.contentHtml ?? htmlContent,
+        browserUrl,
+        jobId,
+      ),
+      abortPromise(abortSignal),
+    ]);
+  }
   abortSignal.throwIfAborted();
 
   const screenshotAssetInfo = await Promise.race([
author	Mohamed Bassem <me@mbassem.com>	2025-12-13 22:09:57 +0000
committer	Mohamed Bassem <me@mbassem.com>	2025-12-13 23:34:19 +0000
commit	f5c32d940ea6c2a6da6c225c2de1eba13b49f9e0 (patch)
tree	81d8458e6a92fe1f93226583ab30f1d3495c276c /apps
parent	d6dd8ebdb614d39890810ff9fbc8d71d35af4f03 (diff)
download	karakeep-f5c32d940ea6c2a6da6c225c2de1eba13b49f9e0.tar.zst