diff options
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/workers/metascraper-plugins/metascraper-reddit.ts | 100 | ||||
| -rw-r--r-- | apps/workers/package.json | 25 | ||||
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 3 |
3 files changed, 115 insertions, 13 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts new file mode 100644 index 00000000..1fbee3ea --- /dev/null +++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts @@ -0,0 +1,100 @@ +import type { Rules } from "metascraper"; + +import logger from "@karakeep/shared/logger"; + +/** + * This is a metascraper plugin to select a better + * 'image' attribute for Reddit links, specifically + * those sharing images. It will also extract the + * Post Title for a Reddit post instead of using the + * default. + * + * As of writing this, Reddit posts do not define + * an open-graph image (og:image) attribute, so + * metascraper resorts to looking for images in + * the HTML DOM, and selects the first one. + * + * In Reddit posts, the first image is typically + * the profile picture of the OP, which Karakeep + * is using for the thumbnail. + * + * This metascraper plugin instead looks for images + * with the domain i.redd.it, on which Reddit hosts + * their preview images for posts. If this plugin + * finds an i.redd.it image, it provides that for + * the image metadata. + * + * If there is not a matching image, this plugin + * will return 'undefined' and the next plugin + * should continue to attempt to extract images. + * + * Note: there is another way to accomplish this. + * If '.json' is appended to a Reddit url, the + * server will provide a JSON document summarizing + * the post. If there are preview images, they are + * included in a section of the JSON. To prevent + * additional server requests, this method is not + * currently being used. + **/ + +const domainFromUrl = (url: string): string => { + /** + * First-party metascraper plugins import metascraper-helpers, + * which exposes a parseUrl function from the tldts package. + * This function does something similar to the 'domainWithoutSuffix' + * field from the tldts package, without requiring any + * additional packages. 
+ **/ + try { + // Create a URL instance to parse the hostname + const hostname = new URL(url).hostname; + const parts = hostname.split("."); + // Return the part before the TLD (assuming at least two segments) + // For example, "www.example.com" -> ["www", "example", "com"] + if (parts.length >= 2) { + return parts[parts.length - 2]; + } + return hostname; + } catch (error) { + logger.error( + "[MetascraperReddit] Test>domainFromUrl received an invalid URL:", + error, + ); + return ""; + } +}; + +const test = ({ url }: { url: string }): boolean => + domainFromUrl(url).toLowerCase() === "reddit"; + +const metascraperReddit = () => { + const rules: Rules = { + pkgName: "metascraper-reddit", + test, + image: ({ htmlDom }) => { + // 'preview' subdomain images are more likely to be what we're after + // but it could be in the 'i' subdomain. + // returns undefined if neither exists + const previewImages = htmlDom('img[src*="preview.redd.it"]') + .map((i, el) => htmlDom(el).attr("src")) + .get(); + const iImages = htmlDom('img[src*="i.redd.it"]') + .map((i, el) => htmlDom(el).attr("src")) + .get(); + return previewImages[0] || iImages[0]; + }, + title: ({ htmlDom }) => { + const title: string | undefined = htmlDom("shreddit-title[title]") + .first() + .attr("title"); + const postTitle: string | undefined = + title ?? + htmlDom("shreddit-post[post-title]").first().attr("post-title"); + return postTitle ? 
postTitle.trim() : undefined; + }, + }; + + return rules; +}; + +export default metascraperReddit; diff --git a/apps/workers/package.json b/apps/workers/package.json index 2ed6f9df..595a6e00 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -18,19 +18,19 @@ "execa": "9.3.1", "jsdom": "^24.0.0", "liteque": "^0.3.2", - "metascraper": "^5.46.5", - "metascraper-amazon": "^5.45.22", - "metascraper-author": "5.46.5", - "metascraper-date": "^5.46.5", - "metascraper-description": "^5.45.22", - "metascraper-image": "^5.45.22", - "metascraper-logo": "^5.45.22", - "metascraper-logo-favicon": "^5.45.22", - "metascraper-publisher": "^5.46.5", - "metascraper-readability": "^5.45.22", - "metascraper-title": "^5.45.22", + "metascraper": "^5.46.18", + "metascraper-amazon": "^5.46.18", + "metascraper-author": "5.46.18", + "metascraper-date": "^5.46.18", + "metascraper-description": "^5.46.18", + "metascraper-image": "^5.46.18", + "metascraper-logo": "^5.46.18", + "metascraper-logo-favicon": "^5.46.18", + "metascraper-publisher": "^5.46.18", + "metascraper-readability": "^5.46.18", + "metascraper-title": "^5.46.18", "metascraper-twitter": "^5.45.6", - "metascraper-url": "^5.45.22", + "metascraper-url": "^5.46.18", "node-cron": "^3.0.3", "node-fetch": "^3.3.2", "pdf2json": "^3.1.5", @@ -48,7 +48,6 @@ "devDependencies": { "@karakeep/prettier-config": "workspace:^0.1.0", "@types/jsdom": "^21.1.6", - "@types/metascraper": "^5.14.3", "@types/node-cron": "^3.0.11" }, "scripts": { diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index d884d149..bf083ff6 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -63,12 +63,15 @@ import { } from "@karakeep/shared/queues"; import { BookmarkTypes } from "@karakeep/shared/types/bookmarks"; +import metascraperReddit from "../metascraper-plugins/metascraper-reddit"; + const metascraperParser = metascraper([ metascraperDate({ dateModified: true, 
datePublished: true, }), metascraperAmazon(), + metascraperReddit(), metascraperReadability(), metascraperAuthor(), metascraperPublisher(), |
