aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers
diff options
context:
space:
mode:
Diffstat (limited to 'apps/workers')
-rw-r--r--apps/workers/metascraper-plugins/metascraper-reddit.ts100
-rw-r--r--apps/workers/package.json25
-rw-r--r--apps/workers/workers/crawlerWorker.ts3
3 files changed, 115 insertions, 13 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts
new file mode 100644
index 00000000..1fbee3ea
--- /dev/null
+++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts
@@ -0,0 +1,100 @@
+import type { Rules } from "metascraper";
+
+import logger from "@karakeep/shared/logger";
+
+/**
+ * This is a metascraper plugin to select a better
+ * 'image' attribute for Reddit links, specifically
+ * those sharing images. It will also extract the
+ * Post Title for a Reddit post instead of using the
+ * default.
+ *
+ * As of writing this, Reddit posts do not define
+ * an open-graph image (og:image) attribute, so
+ * metascraper resorts to looking for images in
+ * the HTML DOM, and selects the first one.
+ *
+ * In Reddit posts, the first image is typically
+ * the profile picture of the OP, which Karakeep
+ * is using for the thumbnail.
+ *
+ * This metascraper plugin instead looks for images
+ * whose source contains preview.redd.it or i.redd.it,
+ * where Reddit hosts preview images for posts. If it
+ * finds one (preview.redd.it first), it provides that
+ * for the image metadata.
+ *
+ * If there is not a matching image, this plugin
+ * will return 'undefined' and the next plugin
+ * should continue to attempt to extract images.
+ *
+ * Note: there is another way to accomplish this.
+ * If '.json' is appended to a Reddit url, the
+ * server will provide a JSON document summarizing
+ * the post. If there are preview images, they are
+ * included in a section of the JSON. To prevent
+ * additional server requests, this method is not
+ * currently being used.
+ **/
+
+const domainFromUrl = (url: string): string => {
+ /**
+ * Returns the second-to-last hostname label ("www.example.com" -> "example"),
+ * the whole hostname if it has one label, or "" (after logging) on a bad URL.
+ * Approximates the 'domainWithoutSuffix' field of the tldts package, which
+ * first-party metascraper plugins get via metascraper-helpers, without a new
+ * dependency. NOTE(review): "reddit.co.uk" yields "co", not "reddit".
+ **/
+ try {
+ // new URL() throws on input it cannot parse; the catch below handles that
+ const hostname = new URL(url).hostname;
+ const parts = hostname.split(".");
+ // Take the label immediately before the final one (treated as the TLD)
+ // e.g. "www.example.com" -> ["www", "example", "com"] -> "example"
+ if (parts.length >= 2) {
+ return parts[parts.length - 2];
+ }
+ return hostname;
+ } catch (error) {
+ logger.error(
+ "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
+ error,
+ );
+ return "";
+ }
+};
+
+const test = ({ url }: { url: string }): boolean => // true only when domainFromUrl(url), lowercased, is exactly "reddit"
+ domainFromUrl(url).toLowerCase() === "reddit";
+
+const metascraperReddit = () => { // builds and returns the Rules object for this plugin
+ const rules: Rules = {
+ pkgName: "metascraper-reddit",
+ test,
+ image: ({ htmlDom }) => {
+ // Prefer the first <img> whose src contains "preview.redd.it"; otherwise
+ // fall back to the first <img> whose src contains "i.redd.it".
+ // Yields undefined when no such image is present.
+ const previewImages = htmlDom('img[src*="preview.redd.it"]')
+ .map((i, el) => htmlDom(el).attr("src"))
+ .get();
+ const iImages = htmlDom('img[src*="i.redd.it"]')
+ .map((i, el) => htmlDom(el).attr("src"))
+ .get();
+ return previewImages[0] || iImages[0];
+ },
+ title: ({ htmlDom }) => { // <shreddit-title title> first, else <shreddit-post post-title>
+ const title: string | undefined = htmlDom("shreddit-title[title]")
+ .first()
+ .attr("title");
+ const postTitle: string | undefined =
+ title ??
+ htmlDom("shreddit-post[post-title]").first().attr("post-title");
+ return postTitle ? postTitle.trim() : undefined; // missing or empty title -> undefined
+ },
+ };
+
+ return rules;
+};
+
+export default metascraperReddit;
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 2ed6f9df..595a6e00 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -18,19 +18,19 @@
"execa": "9.3.1",
"jsdom": "^24.0.0",
"liteque": "^0.3.2",
- "metascraper": "^5.46.5",
- "metascraper-amazon": "^5.45.22",
- "metascraper-author": "5.46.5",
- "metascraper-date": "^5.46.5",
- "metascraper-description": "^5.45.22",
- "metascraper-image": "^5.45.22",
- "metascraper-logo": "^5.45.22",
- "metascraper-logo-favicon": "^5.45.22",
- "metascraper-publisher": "^5.46.5",
- "metascraper-readability": "^5.45.22",
- "metascraper-title": "^5.45.22",
+ "metascraper": "^5.46.18",
+ "metascraper-amazon": "^5.46.18",
+ "metascraper-author": "5.46.18",
+ "metascraper-date": "^5.46.18",
+ "metascraper-description": "^5.46.18",
+ "metascraper-image": "^5.46.18",
+ "metascraper-logo": "^5.46.18",
+ "metascraper-logo-favicon": "^5.46.18",
+ "metascraper-publisher": "^5.46.18",
+ "metascraper-readability": "^5.46.18",
+ "metascraper-title": "^5.46.18",
"metascraper-twitter": "^5.45.6",
- "metascraper-url": "^5.45.22",
+ "metascraper-url": "^5.46.18",
"node-cron": "^3.0.3",
"node-fetch": "^3.3.2",
"pdf2json": "^3.1.5",
@@ -48,7 +48,6 @@
"devDependencies": {
"@karakeep/prettier-config": "workspace:^0.1.0",
"@types/jsdom": "^21.1.6",
- "@types/metascraper": "^5.14.3",
"@types/node-cron": "^3.0.11"
},
"scripts": {
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index d884d149..bf083ff6 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -63,12 +63,15 @@ import {
} from "@karakeep/shared/queues";
import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
+import metascraperReddit from "../metascraper-plugins/metascraper-reddit";
+
const metascraperParser = metascraper([
metascraperDate({
dateModified: true,
datePublished: true,
}),
metascraperAmazon(),
+ metascraperReddit(),
metascraperReadability(),
metascraperAuthor(),
metascraperPublisher(),