From 7cc4b08aab654818933d655ee3cbd2db217090a2 Mon Sep 17 00:00:00 2001 From: David Woods Date: Sun, 22 Jun 2025 16:14:43 -0400 Subject: feat(workers): adding a local metascraper plugin for Reddit posts (#1302) * chore: metascraper 5.x comes with its own types, so including @types/metascraper is now redundant; also updating to latest versions of metascraper libraries * feat (workers): creating a local metascraper plugin for Reddit posts In the past, the preview images for bookmarks from Reddit links were poorly chosen. Reddit does not use opengraph tags, so metascraper-image simply looked for all images on the page and returned the first. This tended to be the profile picture for the poster for the Reddit link. This new plugin, using the existing metascraper framework, provides a better selection of image for the bookmark when the URL domain is 'reddit'. In addition, recent changes (I believe this was a side effect of adding the metascraper-author and/or the metascraper-publisher plugins, but it could also be related to the metascraper-readability plugin) broke what used to be a good choice of bookmark title. Previously, titles looked like 'Tinyauth just reached 1000 stars! : r/selfhosted' with both thread title and subreddit mentioned. After this update, all Reddit posts now have the same title: 'The heart of the internet'. To return to the better format, this new metascraper-reddit plugin now attempts to retrieve the better title from reddit URLs. Note that in order to gain precedence in title selection, the 'metascraperReddit()' inclusion in the crawlerWorker.ts metascraper instantiation list had to be moved above metascraperReadability(). 
* chore: updated Hoarder in text to Karakeep * chore: update metascraper versions fix for metascraper types has been merged; the expect-error comment can be removed * chore: merge with master --------- Co-authored-by: Mohamed Bassem --- .../metascraper-plugins/metascraper-reddit.ts | 100 +++++++++++++++++++++ apps/workers/package.json | 25 +++--- apps/workers/workers/crawlerWorker.ts | 3 + 3 files changed, 115 insertions(+), 13 deletions(-) create mode 100644 apps/workers/metascraper-plugins/metascraper-reddit.ts (limited to 'apps') diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts new file mode 100644 index 00000000..1fbee3ea --- /dev/null +++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts @@ -0,0 +1,100 @@ +import type { Rules } from "metascraper"; + +import logger from "@karakeep/shared/logger"; + +/** + * This is a metascraper plugin to select a better + * 'image' attribute for Reddit links, specifically + * those sharing images. It will also extract the + * Post Title for a Reddit post instead of use the + * default. + * + * As of writing this, Reddit posts do not define + * an open-graph image (og:image) attribute, so + * metascraper resorts to looking for images in + * the HTML DOM, and selects the first one. + * + * In Reddit posts, the first image is typically + * the profile picture of the OP, which Karakeep + * is using for the thumbnail. + * + * This metascraper plugin instead looks for images + * with the domain i.redd.it, on which Reddit hosts + * their preview images for posts. If this plugin + * finds an i.redd.it image, it provides that for + * the image metadata. + * + * If there is not a matching image, this plugin + * will return 'undefined' and the next plugin + * should continue to attempt to extract images. + * + * Note: there is another way to accomplish this. 
/**
 * Returns the hostname label that sits immediately before the last label
 * of `url`'s hostname, e.g. "https://www.reddit.com/r/x" -> "reddit".
 *
 * First-party metascraper plugins import metascraper-helpers, which exposes
 * a parseUrl function backed by the tldts package. This helper approximates
 * the 'domainWithoutSuffix' field of that package without pulling in any
 * additional dependency. It does not consult the public-suffix list, so
 * hosts under multi-label suffixes (e.g. "example.co.uk") yield the suffix
 * label ("co") rather than the registrable name.
 *
 * @param url Absolute URL to inspect.
 * @returns The second-to-last hostname label; the whole hostname when it
 *   has fewer than two labels; "" when `url` cannot be parsed (the parse
 *   error is logged).
 */
const domainFromUrl = (url: string): string => {
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch (error) {
    logger.error(
      "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
      error,
    );
    return "";
  }

  // A fully-qualified hostname ("www.reddit.com.") keeps its trailing dot
  // after URL parsing, which would leave an empty last label and shift the
  // index below by one. Drop empty labels before indexing.
  const labels = hostname.split(".").filter((label) => label.length > 0);

  // Indexing is undefined when there are fewer than two labels; fall back
  // to the hostname itself in that case.
  return labels[labels.length - 2] ?? hostname;
};

/**
 * metascraper `test` hook: the rules below only run when this returns true,
 * i.e. when the URL's domain label is "reddit" (case-insensitive).
 */
const test = ({ url }: { url: string }): boolean =>
  domainFromUrl(url).toLowerCase() === "reddit";

/**
 * Builds the metascraper rule bundle for Reddit posts.
 *
 * - `image`: picks the first <img> whose src contains "preview.redd.it",
 *   falling back to the first whose src contains "i.redd.it". Returns
 *   undefined when neither exists so the next plugin can keep looking.
 * - `title`: reads the post title from the `title` attribute of
 *   <shreddit-title>, falling back to the `post-title` attribute of
 *   <shreddit-post>.
 *
 * Note: there is another way to accomplish this. If '.json' is appended to
 * a Reddit URL, the server provides a JSON document summarizing the post,
 * including any preview images. To prevent additional server requests,
 * that method is not currently being used.
 */
const metascraperReddit = () => {
  const rules: Rules = {
    pkgName: "metascraper-reddit",
    test,
    image: ({ htmlDom }) => {
      // 'preview' subdomain images are more likely to be what we're after,
      // but the image could also live on the 'i' subdomain.
      const previewSrc = htmlDom('img[src*="preview.redd.it"]')
        .first()
        .attr("src");
      const directSrc = htmlDom('img[src*="i.redd.it"]').first().attr("src");
      return previewSrc || directSrc;
    },
    title: ({ htmlDom }) => {
      const candidates: (string | undefined)[] = [
        htmlDom("shreddit-title[title]").first().attr("title"),
        htmlDom("shreddit-post[post-title]").first().attr("post-title"),
      ];
      // Use the first candidate that still has content after trimming, so
      // an empty or whitespace-only attribute does not block the fallback.
      for (const candidate of candidates) {
        const trimmed = candidate?.trim();
        if (trimmed) {
          return trimmed;
        }
      }
      return undefined;
    },
  };

  return rules;
};

export default metascraperReddit;
"../metascraper-plugins/metascraper-reddit"; + const metascraperParser = metascraper([ metascraperDate({ dateModified: true, datePublished: true, }), metascraperAmazon(), + metascraperReddit(), metascraperReadability(), metascraperAuthor(), metascraperPublisher(), -- cgit v1.2.3-70-g09d2