aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers/metascraper-plugins
diff options
context:
space:
mode:
Diffstat (limited to 'apps/workers/metascraper-plugins')
-rw-r--r--apps/workers/metascraper-plugins/metascraper-reddit.ts100
1 files changed, 100 insertions, 0 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts
new file mode 100644
index 00000000..1fbee3ea
--- /dev/null
+++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts
@@ -0,0 +1,100 @@
+import type { Rules } from "metascraper";
+
+import logger from "@karakeep/shared/logger";
+
+/**
+ * This is a metascraper plugin to select a better
+ * 'image' attribute for Reddit links, specifically
+ * those sharing images. It will also extract the
+ * Post Title for a Reddit post instead of using the
+ * default.
+ *
+ * As of writing this, Reddit posts do not define
+ * an open-graph image (og:image) attribute, so
+ * metascraper resorts to looking for images in
+ * the HTML DOM, and selects the first one.
+ *
+ * In Reddit posts, the first image is typically
+ * the profile picture of the OP, which Karakeep
+ * is using for the thumbnail.
+ *
+ * This metascraper plugin instead looks for images
+ * with the domain i.redd.it, on which Reddit hosts
+ * their preview images for posts. If this plugin
+ * finds an i.redd.it image, it provides that for
+ * the image metadata.
+ *
+ * If there is not a matching image, this plugin
+ * will return 'undefined' and the next plugin
+ * should continue to attempt to extract images.
+ *
+ * Note: there is another way to accomplish this.
+ * If '.json' is appended to a Reddit url, the
+ * server will provide a JSON document summarizing
+ * the post. If there are preview images, they are
+ * included in a section of the JSON. To prevent
+ * additional server requests, this method is not
+ * currently being used.
+ **/
+
+/**
+ * Returns the hostname label that sits immediately before the last label
+ * of the URL's hostname, e.g. "https://www.example.com/x" -> "example".
+ *
+ * First-party metascraper plugins import metascraper-helpers, which
+ * exposes a parseUrl function from the tldts package. This function is
+ * a dependency-free approximation of the 'domainWithoutSuffix' field
+ * from that package.
+ *
+ * Limitation: no public-suffix list is consulted, so a multi-label
+ * suffix is not understood ("example.co.uk" yields "co").
+ *
+ * @param url - Absolute URL to inspect.
+ * @returns The second-to-last hostname label; the whole hostname when it
+ *   has fewer than two labels; or "" when the URL cannot be parsed (the
+ *   parse error is logged).
+ */
+const domainFromUrl = (url: string): string => {
+  try {
+    const rawHostname = new URL(url).hostname;
+    // A fully-qualified hostname may carry a trailing root dot
+    // ("www.reddit.com."). Left in place, it produces an empty final
+    // label and shifts the result to the TLD, so drop it first.
+    const hostname = rawHostname.endsWith(".")
+      ? rawHostname.slice(0, -1)
+      : rawHostname;
+    const labels = hostname.split(".");
+    // For example, "www.example.com" -> ["www", "example", "com"].
+    // With fewer than two labels the index is out of range and we fall
+    // back to the hostname itself.
+    return labels[labels.length - 2] ?? hostname;
+  } catch (error) {
+    logger.error(
+      "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
+      error,
+    );
+    return "";
+  }
+};
+
+/**
+ * Predicate handed to metascraper as the plugin's `test` entry: reports
+ * whether the given URL's domain label, as computed by domainFromUrl
+ * and compared case-insensitively, is "reddit".
+ */
+const test = ({ url }: { url: string }): boolean => {
+  const siteLabel = domainFromUrl(url).toLowerCase();
+  return siteLabel === "reddit";
+};
+
+/**
+ * Builds the metascraper rule set for Reddit pages.
+ *
+ * - `test` limits the rules to URLs accepted by the `test` predicate.
+ * - `image` returns the `src` of the first <img> whose source contains
+ *   "preview.redd.it", else the first whose source contains "i.redd.it",
+ *   else undefined.
+ * - `title` returns the first non-blank, trimmed value among the `title`
+ *   attribute of <shreddit-title> and the `post-title` attribute of
+ *   <shreddit-post>, else undefined.
+ *
+ * @returns The rules object to pass to metascraper.
+ */
+const metascraperReddit = () => {
+  const rules: Rules = {
+    pkgName: "metascraper-reddit",
+    test,
+    image: ({ htmlDom }) => {
+      // 'preview' subdomain images are more likely to be what we're after,
+      // but the image could live on the 'i' subdomain instead.
+      const previewSrc = htmlDom('img[src*="preview.redd.it"]')
+        .first()
+        .attr("src");
+      if (previewSrc) {
+        return previewSrc;
+      }
+      // undefined when neither kind of image exists
+      return htmlDom('img[src*="i.redd.it"]').first().attr("src");
+    },
+    title: ({ htmlDom }) => {
+      const candidates: (string | undefined)[] = [
+        htmlDom("shreddit-title[title]").first().attr("title"),
+        htmlDom("shreddit-post[post-title]").first().attr("post-title"),
+      ];
+      // Trim before testing so that an empty or whitespace-only attribute
+      // on the first element does not block the fallback to the second.
+      for (const candidate of candidates) {
+        const trimmed = candidate?.trim();
+        if (trimmed) {
+          return trimmed;
+        }
+      }
+      return undefined;
+    },
+  };
+
+  return rules;
+};
+
+export default metascraperReddit;