3 files changed, 115 insertions, 13 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts
new file mode 100644
index 00000000..1fbee3ea
--- /dev/null
+++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts
@@ -0,0 +1,100 @@
+import type { Rules } from "metascraper";
+
+import logger from "@karakeep/shared/logger";
+
+/**
+ * This is a metascraper plugin to select a better
+ * 'image' attribute for Reddit links, specifically
+ * those sharing images. It will also extract the
+ * Post Title for a Reddit post instead of using the
+ * default.
+ *
+ * As of writing this, Reddit posts do not define
+ * an open-graph image (og:image) attribute, so
+ * metascraper resorts to looking for images in
+ * the HTML DOM, and selects the first one.
+ *
+ * In Reddit posts, the first image is typically
+ * the profile picture of the OP, which Karakeep
+ * is using for the thumbnail.
+ *
+ * This metascraper plugin instead looks for images
+ * hosted on preview.redd.it or i.redd.it, on which
+ * Reddit hosts the images for posts. If this plugin
+ * finds such an image (preferring preview.redd.it),
+ * it provides that for the image metadata.
+ *
+ * If there is not a matching image, this plugin
+ * will return 'undefined' and the next plugin
+ * should continue to attempt to extract images.
+ *
+ * Note: there is another way to accomplish this.
+ * If '.json' is appended to a Reddit url, the
+ * server will provide a JSON document summarizing
+ * the post. If there are preview images, they are
+ * included in a section of the JSON. To prevent
+ * additional server requests, this method is not
+ * currently being used.
+ **/
+
+const domainFromUrl = (url: string): string => {
+  /**
+   * Returns the second-to-last dot-separated label of the URL's
+   * hostname (the whole hostname if it has no dot), or "" when the
+   * URL cannot be parsed. First-party metascraper plugins get this
+   * from tldts' 'domainWithoutSuffix' via metascraper-helpers'
+   * parseUrl; this approximates it without any additional packages.
+   **/
+  try {
+    // new URL() throws on malformed input; the catch below handles that
+    const hostname = new URL(url).hostname;
+    const parts = hostname.split(".");
+    // Return the label just before the last one: "www.example.com" -> "example".
+    // Unlike tldts, multi-label suffixes are not handled ("a.co.uk" -> "co").
+    if (parts.length >= 2) {
+      return parts[parts.length - 2];
+    }
+    return hostname;
+  } catch (error) {
+    logger.error(
+      "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
+      error,
+    );
+    return "";
+  }
+};
+
+const test = ({ url }: { url: string }): boolean =>
+  domainFromUrl(url).toLowerCase() === "reddit"; // e.g. true for "https://www.reddit.com/r/x"
+
+const metascraperReddit = () => {
+  const rules: Rules = {
+    pkgName: "metascraper-reddit",
+    test,
+    image: ({ htmlDom }) => {
+      // Prefer an <img> whose src contains "preview.redd.it", falling back to
+      // one containing "i.redd.it"; src is matched by substring (*=), not host.
+      // Evaluates to undefined when neither selector matches any <img>.
+      const previewImages = htmlDom('img[src*="preview.redd.it"]')
+        .map((i, el) => htmlDom(el).attr("src"))
+        .get();
+      const iImages = htmlDom('img[src*="i.redd.it"]')
+        .map((i, el) => htmlDom(el).attr("src"))
+        .get();
+      return previewImages[0] || iImages[0];
+    },
+    title: ({ htmlDom }) => {
+      const title: string | undefined = htmlDom("shreddit-title[title]")
+        .first()
+        .attr("title");
+      const postTitle: string | undefined =
+        title ?? // fall back to the first <shreddit-post post-title="...">
+        htmlDom("shreddit-post[post-title]").first().attr("post-title");
+      return postTitle ? postTitle.trim() : undefined;
+    },
+  };
+
+  return rules;
+};
+
+export default metascraperReddit;
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 2ed6f9df..595a6e00 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -18,19 +18,19 @@
     "execa": "9.3.1",
     "jsdom": "^24.0.0",
     "liteque": "^0.3.2",
-    "metascraper": "^5.46.5",
-    "metascraper-amazon": "^5.45.22",
-    "metascraper-author": "5.46.5",
-    "metascraper-date": "^5.46.5",
-    "metascraper-description": "^5.45.22",
-    "metascraper-image": "^5.45.22",
-    "metascraper-logo": "^5.45.22",
-    "metascraper-logo-favicon": "^5.45.22",
-    "metascraper-publisher": "^5.46.5",
-    "metascraper-readability": "^5.45.22",
-    "metascraper-title": "^5.45.22",
+    "metascraper": "^5.46.18",
+    "metascraper-amazon": "^5.46.18",
+    "metascraper-author": "5.46.18",
+    "metascraper-date": "^5.46.18",
+    "metascraper-description": "^5.46.18",
+    "metascraper-image": "^5.46.18",
+    "metascraper-logo": "^5.46.18",
+    "metascraper-logo-favicon": "^5.46.18",
+    "metascraper-publisher": "^5.46.18",
+    "metascraper-readability": "^5.46.18",
+    "metascraper-title": "^5.46.18",
     "metascraper-twitter": "^5.45.6",
-    "metascraper-url": "^5.45.22",
+    "metascraper-url": "^5.46.18",
     "node-cron": "^3.0.3",
     "node-fetch": "^3.3.2",
     "pdf2json": "^3.1.5",
@@ -48,7 +48,6 @@
   "devDependencies": {
     "@karakeep/prettier-config": "workspace:^0.1.0",
     "@types/jsdom": "^21.1.6",
-    "@types/metascraper": "^5.14.3",
     "@types/node-cron": "^3.0.11"
   },
   "scripts": {
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index d884d149..bf083ff6 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -63,12 +63,15 @@ import {
 } from "@karakeep/shared/queues";
 import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
 
+import metascraperReddit from "../metascraper-plugins/metascraper-reddit";
+
 const metascraperParser = metascraper([
   metascraperDate({
     dateModified: true,
     datePublished: true,
   }),
   metascraperAmazon(),
+  metascraperReddit(),
   metascraperReadability(),
   metascraperAuthor(),
   metascraperPublisher(),