aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers/metascraper-plugins/metascraper-reddit.ts
blob: 1fbee3ead9fd2cdc04649bf19eff9ca579bce9ed (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import type { Rules } from "metascraper";

import logger from "@karakeep/shared/logger";

/**
 * This is a metascraper plugin to select a better
 * 'image' attribute for Reddit links, specifically
 * those sharing images. It will also extract the
 * Post Title for a Reddit post instead of using the
 * default.
 *
 * As of writing this, Reddit posts do not define
 * an open-graph image (og:image) attribute, so
 * metascraper resorts to looking for images in
 * the HTML DOM, and selects the first one.
 *
 * In Reddit posts, the first image is typically
 * the profile picture of the OP, which Karakeep
 * is using for the thumbnail.
 *
 * This metascraper plugin instead looks for images
 * hosted on preview.redd.it or i.redd.it, the domains
 * on which Reddit hosts post images, preferring
 * preview.redd.it. If this plugin finds such an image,
 * it provides that for the image metadata.
 *
 * If there is not a matching image, this plugin
 * will return 'undefined' and the next plugin
 * should continue to attempt to extract images.
 *
 * Note: there is another way to accomplish this.
 * If '.json' is appended to a Reddit url, the
 * server will provide a JSON document summarizing
 * the post. If there are preview images, they are
 * included in a section of the JSON. To prevent
 * additional server requests, this method is not
 * currently being used.
 **/

/**
 * Extracts the hostname label that sits immediately before the top-level
 * domain, e.g. "https://www.example.com/path" -> "example".
 *
 * First-party metascraper plugins import metascraper-helpers, which exposes
 * a parseUrl function from the tldts package. This helper approximates the
 * 'domainWithoutSuffix' field of that package without requiring any
 * additional packages. It does not consult the public suffix list, so
 * multi-label suffixes are not recognised ("example.co.uk" yields "co").
 *
 * @param url - Absolute URL to inspect.
 * @returns The second-to-last hostname label, the whole hostname when it
 *   consists of a single label, or "" when `url` cannot be parsed (the
 *   parse failure is logged).
 */
const domainFromUrl = (url: string): string => {
  try {
    // A fully-qualified hostname may carry a trailing root dot
    // ("www.reddit.com."). Strip it so it is not counted as an empty label,
    // which would otherwise shift the result to the TLD.
    const hostname = new URL(url).hostname.replace(/\.$/, "");
    const labels = hostname.split(".");
    // For example, "www.example.com" -> ["www", "example", "com"] -> "example"
    if (labels.length >= 2) {
      return labels[labels.length - 2] ?? hostname;
    }
    return hostname;
  } catch (error) {
    logger.error(
      "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
      error,
    );
    return "";
  }
};

/**
 * Gate for this plugin's rules: accepts only URLs whose domain label
 * (as computed by domainFromUrl) equals "reddit", ignoring case.
 */
const test = ({ url }: { url: string }): boolean => {
  const domainLabel = domainFromUrl(url);
  return domainLabel.toLowerCase() === "reddit";
};

/**
 * Creates the metascraper rule set for Reddit pages.
 *
 * The returned rules are limited to Reddit URLs by the `test` predicate
 * and provide two fields:
 *  - `image`: the first <img> whose src contains "preview.redd.it",
 *    otherwise the first whose src contains "i.redd.it".
 *  - `title`: the post title read from the page's shreddit-title /
 *    shreddit-post elements.
 *
 * Each rule returns `undefined` when nothing suitable is found.
 */
const metascraperReddit = () => {
  const rules: Rules = {
    pkgName: "metascraper-reddit",
    test,
    image: ({ htmlDom }) => {
      // 'preview' subdomain images are more likely to be what we're after,
      // so they win; the 'i' subdomain is only queried as a fallback.
      const previewSrc = htmlDom('img[src*="preview.redd.it"]')
        .first()
        .attr("src");
      return previewSrc || htmlDom('img[src*="i.redd.it"]').first().attr("src");
    },
    title: ({ htmlDom }) => {
      // Trim before deciding, so an empty or whitespace-only attribute on
      // shreddit-title does not block the shreddit-post fallback.
      const primaryTitle = htmlDom("shreddit-title[title]")
        .first()
        .attr("title")
        ?.trim();
      if (primaryTitle) {
        return primaryTitle;
      }
      const fallbackTitle = htmlDom("shreddit-post[post-title]")
        .first()
        .attr("post-title")
        ?.trim();
      // Normalise a blank result to undefined.
      return fallbackTitle || undefined;
    },
  };

  return rules;
};

export default metascraperReddit;