diff options
| author | David Woods <david@birnamdesigns.com> | 2025-06-22 16:14:43 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-06-22 21:14:43 +0100 |
| commit | 7cc4b08aab654818933d655ee3cbd2db217090a2 (patch) | |
| tree | a5c9dcded3b952bcaccb42a752f7bfa2eeb850fb /apps | |
| parent | 112aa9d942ef0f8548c3728e6218c27cc335a601 (diff) | |
| download | karakeep-7cc4b08aab654818933d655ee3cbd2db217090a2.tar.zst | |
feat(workers): adding a local metascraper plugin for Reddit posts (#1302)
* chore: metascraper 5.x comes with its own types, including @types/metascraper is now redundant; also updating to latest versions of metascraper libraries
* feat (workers): creating a local metascraper plugin for Reddit posts
In the past, the preview images for bookmarks from Reddit links were
poorly chosen. Reddit does not use opengraph tags, so metascraper-images
simply looked for all images on the page and returned the first. This
tended to be the profile picture for the poster for the Reddit link.
This new plugin, using the existing metascraper framework, provides a
better selection of image for the bookmark when the URL domain is
'reddit'.
In addition, recent changes (I believe this was a side effect of adding
the metascraper-author and/or the metascraper-publisher plugins, but it
could also be related to the metascraper-readability plugin) broke what
used to be a good choice of bookmark title. Previously, titles looked
like 'Tinyauth just reached 1000 stars! : r/selfhosted' with both thread
title and subreddit mentioned. After this update, all Reddit posts now
have the same title: 'The heart of the internet'.
To return to the better format, this new metascraper-reddit plugin now
attempts to retrieve the better title from reddit URLs. Note that in
order to gain precedence in title selection, the 'metascraperReddit()'
inclusion in the crawlerWorker.ts metascraper instantiation list had to
be moved above metascraperReadability().
* chore: updated Hoarder in text to Karakeep
* chore: update metascraper versions
fix for metascraper types has been merged; the expect-error comment can
be removed
* chore: merge with master
---------
Co-authored-by: Mohamed Bassem <me@mbassem.com>
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/workers/metascraper-plugins/metascraper-reddit.ts | 100 | ||||
| -rw-r--r-- | apps/workers/package.json | 25 | ||||
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 3 |
3 files changed, 115 insertions, 13 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts new file mode 100644 index 00000000..1fbee3ea --- /dev/null +++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts @@ -0,0 +1,100 @@ +import type { Rules } from "metascraper"; + +import logger from "@karakeep/shared/logger"; + +/** + * This is a metascraper plugin to select a better + * 'image' attribute for Reddit links, specifically + * those sharing images. It will also extract the + * Post Title for a Reddit post instead of use the + * default. + * + * As of writing this, Reddit posts do not define + * an open-graph image (og:image) attribute, so + * metascraper resorts to looking for images in + * the HTML DOM, and selects the first one. + * + * In Reddit posts, the first image is typically + * the profile picture of the OP, which Karakeep + * is using for the thumbnail. + * + * This metascraper plugin instead looks for images + * with the domain i.redd.it, on which Reddit hosts + * their preview images for posts. If this plugin + * finds an i.redd.it image, it provides that for + * the image metadata. + * + * If there is not a matching image, this plugin + * will return 'undefined' and the next plugin + * should continue to attempt to extract images. + * + * Note: there is another way to accomplish this. + * If '.json' is appended to a Reddit url, the + * server will provide a JSON document summarizing + * the post. If there are preview images, they are + * included in a section of the JSON. To prevent + * additional server requests, this method is not + * currently being used. + **/ + +const domainFromUrl = (url: string): string => { + /** + * First-party metascraper plugins import metascraper-helpers, + * which exposes a parseUrl function from the tldtr package. + * This function does similar to the 'domainWithoutSuffix' + * field from the tldtr package, without requiring any + * additional packages. 
+ **/ + try { + // Create a URL instance to parse the hostname + const hostname = new URL(url).hostname; + const parts = hostname.split("."); + // Return the part before the TLD (assuming at least two segments) + // For example, "www.example.com" -> ["www", "example", "com"] + if (parts.length >= 2) { + return parts[parts.length - 2]; + } + return hostname; + } catch (error) { + logger.error( + "[MetascraperReddit] Test>domainFromUrl received an invalid URL:", + error, + ); + return ""; + } +}; + +const test = ({ url }: { url: string }): boolean => + domainFromUrl(url).toLowerCase() === "reddit"; + +const metascraperReddit = () => { + const rules: Rules = { + pkgName: "metascraper-reddit", + test, + image: ({ htmlDom }) => { + // 'preview' subdomain images are more likely to be what we're after + // but it could be in the 'i' subdomain. + // returns undefined if neither exists + const previewImages = htmlDom('img[src*="preview.redd.it"]') + .map((i, el) => htmlDom(el).attr("src")) + .get(); + const iImages = htmlDom('img[src*="i.redd.it"]') + .map((i, el) => htmlDom(el).attr("src")) + .get(); + return previewImages[0] || iImages[0]; + }, + title: ({ htmlDom }) => { + const title: string | undefined = htmlDom("shreddit-title[title]") + .first() + .attr("title"); + const postTitle: string | undefined = + title ?? + htmlDom("shreddit-post[post-title]").first().attr("post-title"); + return postTitle ? 
postTitle.trim() : undefined; + }, + }; + + return rules; +}; + +export default metascraperReddit; diff --git a/apps/workers/package.json b/apps/workers/package.json index 2ed6f9df..595a6e00 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -18,19 +18,19 @@ "execa": "9.3.1", "jsdom": "^24.0.0", "liteque": "^0.3.2", - "metascraper": "^5.46.5", - "metascraper-amazon": "^5.45.22", - "metascraper-author": "5.46.5", - "metascraper-date": "^5.46.5", - "metascraper-description": "^5.45.22", - "metascraper-image": "^5.45.22", - "metascraper-logo": "^5.45.22", - "metascraper-logo-favicon": "^5.45.22", - "metascraper-publisher": "^5.46.5", - "metascraper-readability": "^5.45.22", - "metascraper-title": "^5.45.22", + "metascraper": "^5.46.18", + "metascraper-amazon": "^5.46.18", + "metascraper-author": "5.46.18", + "metascraper-date": "^5.46.18", + "metascraper-description": "^5.46.18", + "metascraper-image": "^5.46.18", + "metascraper-logo": "^5.46.18", + "metascraper-logo-favicon": "^5.46.18", + "metascraper-publisher": "^5.46.18", + "metascraper-readability": "^5.46.18", + "metascraper-title": "^5.46.18", "metascraper-twitter": "^5.45.6", - "metascraper-url": "^5.45.22", + "metascraper-url": "^5.46.18", "node-cron": "^3.0.3", "node-fetch": "^3.3.2", "pdf2json": "^3.1.5", @@ -48,7 +48,6 @@ "devDependencies": { "@karakeep/prettier-config": "workspace:^0.1.0", "@types/jsdom": "^21.1.6", - "@types/metascraper": "^5.14.3", "@types/node-cron": "^3.0.11" }, "scripts": { diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index d884d149..bf083ff6 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -63,12 +63,15 @@ import { } from "@karakeep/shared/queues"; import { BookmarkTypes } from "@karakeep/shared/types/bookmarks"; +import metascraperReddit from "../metascraper-plugins/metascraper-reddit"; + const metascraperParser = metascraper([ metascraperDate({ dateModified: true, 
datePublished: true, }), metascraperAmazon(), + metascraperReddit(), metascraperReadability(), metascraperAuthor(), metascraperPublisher(), |
