1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
|
import type { CheerioAPI } from "cheerio";
import type { Rules, RulesOptions } from "metascraper";
import { decode as decodeHtmlEntities } from "html-entities";
import { fetchWithProxy } from "network";
import { z } from "zod";
import logger from "@karakeep/shared/logger";
/**
* This is a metascraper plugin to select a better
* 'image' attribute for Reddit links, specifically
* those sharing images. It will also extract the
 * Post Title for a Reddit post instead of using the
 * default.
*
* As of writing this, Reddit posts do not define
* an open-graph image (og:image) attribute, so
* metascraper resorts to looking for images in
* the HTML DOM, and selects the first one.
*
 * In Reddit posts, the first image is typically
 * the profile picture of the OP, which Karakeep
 * would otherwise use for the thumbnail.
*
* This metascraper plugin instead looks for images
* with the domain i.redd.it, on which Reddit hosts
* their preview images for posts. If this plugin
* finds an i.redd.it image, it provides that for
* the image metadata.
*
* If there is not a matching image, this plugin
* will return 'undefined' and the next plugin
* should continue to attempt to extract images.
*
* We also attempt to fetch the Reddit JSON response
* (by appending '.json' to the URL) to grab the
* title and preview images directly from the API.
**/
// One entry of a post's `preview.images` array: Reddit provides a
// full-size `source` plus an array of downscaled `resolutions`.
const redditPreviewImageSchema = z.object({
  source: z.object({ url: z.string().optional() }).optional(),
  resolutions: z.array(z.object({ url: z.string().optional() })).optional(),
});
// One item of `media_metadata` (gallery posts): `s` is the source
// rendition and `p` the preview renditions; the URL lives under `u`.
const redditMediaMetadataItemSchema = z.object({
  s: z.object({ u: z.string().optional() }).optional(),
  p: z.array(z.object({ u: z.string().optional() })).optional(),
});
// The subset of a Reddit post object (a listing child's `data`) that this
// plugin consumes. Everything is optional/nullish because Reddit omits
// fields depending on the post type.
const redditPostSchema = z.object({
  title: z.string().optional(),
  preview: z
    .object({ images: z.array(redditPreviewImageSchema).optional() })
    .optional(),
  // Destination URL for link/image posts; `url` is the older field.
  url_overridden_by_dest: z.string().optional(),
  url: z.string().optional(),
  thumbnail: z.string().optional(),
  // Present on gallery posts; keyed by an opaque media id.
  media_metadata: z.record(redditMediaMetadataItemSchema).optional(),
  author: z.string().optional(),
  // Seconds since the Unix epoch.
  created_utc: z.number().optional(),
  // Self-post body; Reddit may send null, hence nullish.
  selftext: z.string().nullish(),
  selftext_html: z.string().nullish(),
  // e.g. "r/pics" — used below as the publisher value.
  subreddit_name_prefixed: z.string().optional(),
});
type RedditPostData = z.infer<typeof redditPostSchema>;
// The `<post-url>.json` endpoint returns an array of listings; the post
// itself is a child of one of those listings.
const redditResponseSchema = z.array(
  z.object({
    data: z.object({
      children: z.array(z.object({ data: redditPostSchema })).optional(),
    }),
  }),
);
// Result of a JSON fetch attempt. `fetched` distinguishes "the API
// answered" (possibly with no usable post) from "the API was
// unreachable/forbidden"; callers use it to decide whether DOM-scraping
// fallbacks should run.
interface RedditFetchResult {
  fetched: boolean;
  post?: RedditPostData;
}
const REDDIT_CACHE_TTL_MS = 60 * 1000; // 1 minute TTL to avoid stale data
// Cache entries hold the in-flight promise so the several metascraper
// rules below share a single network request per URL.
interface RedditCacheEntry {
  expiresAt: number;
  promise: Promise<RedditFetchResult>;
}
// Keyed by the original post URL (not the derived `.json` URL).
const redditJsonCache = new Map<string, RedditCacheEntry>();
/**
 * Remove every cache entry whose TTL has elapsed as of `now`.
 */
const purgeExpiredCacheEntries = (now: number) => {
  const expiredKeys: string[] = [];
  for (const [key, entry] of redditJsonCache.entries()) {
    if (entry.expiresAt <= now) {
      expiredKeys.push(key);
    }
  }
  for (const key of expiredKeys) {
    redditJsonCache.delete(key);
  }
};
/**
 * Decode HTML entities in a Reddit-supplied URL (Reddit escapes `&` as
 * `&amp;` in its JSON payloads). Returns undefined for missing or empty
 * input, and for input that decodes to the empty string.
 */
const decodeRedditUrl = (url?: string): string | undefined => {
  if (!url) {
    return undefined;
  }
  const result = decodeHtmlEntities(url);
  return result === "" ? undefined : result;
};
/**
 * Derive the Reddit JSON API URL for a post by appending ".json" to the
 * path (replacing a single trailing slash if present). Query string and
 * hash are preserved. Throws if `url` is not a valid URL.
 */
const buildJsonUrl = (url: string): string => {
  const parsed = new URL(url);
  if (!parsed.pathname.endsWith(".json")) {
    const basePath = parsed.pathname.endsWith("/")
      ? parsed.pathname.slice(0, -1)
      : parsed.pathname;
    parsed.pathname = `${basePath}.json`;
  }
  return parsed.toString();
};
/**
 * Pull the first usable image URL out of a gallery post's media_metadata:
 * prefer the source rendition (`s.u`), then the first preview (`p[0].u`).
 * Returns undefined when there is no metadata or no decodable URL.
 */
const extractImageFromMediaMetadata = (
  media_metadata?: RedditPostData["media_metadata"],
): string | undefined => {
  const items = media_metadata ? Object.values(media_metadata) : [];
  const first = items[0];
  if (first === undefined) {
    return undefined;
  }
  const sourceUrl = decodeRedditUrl(first.s?.u);
  if (sourceUrl !== undefined) {
    return sourceUrl;
  }
  return decodeRedditUrl(first.p?.[0]?.u);
};
/**
 * True when `urlCandidate` is a valid URL whose host is redd.it or one of
 * its subdomains (i.redd.it, preview.redd.it, ...), where Reddit hosts
 * post media.
 *
 * Uses an exact / dot-suffix match instead of the previous substring
 * check (`hostname.includes("redd.it")`), which would also accept
 * unrelated hosts such as "redd.it.example.com".
 *
 * Returns false for strings that are not parseable URLs.
 */
const isRedditImageHost = (urlCandidate: string): boolean => {
  try {
    const hostname = new URL(urlCandidate).hostname;
    return hostname === "redd.it" || hostname.endsWith(".redd.it");
  } catch {
    return false;
  }
};
/**
 * Choose the best image URL for a post, in priority order:
 * 1. the first preview image (its source, then its first resolution),
 * 2. gallery media_metadata,
 * 3. the post's direct URL / thumbnail, but only when hosted on redd.it.
 * Returns undefined when nothing suitable is found.
 */
const extractImageFromPost = (post: RedditPostData): string | undefined => {
  const firstPreview = post.preview?.images?.[0];
  const fromPreview =
    decodeRedditUrl(firstPreview?.source?.url) ??
    decodeRedditUrl(firstPreview?.resolutions?.[0]?.url);
  if (fromPreview !== undefined) {
    return fromPreview;
  }
  const fromGallery = extractImageFromMediaMetadata(post.media_metadata);
  if (fromGallery !== undefined) {
    return fromGallery;
  }
  // Only the FIRST decodable candidate is considered, matching the
  // original `a ?? b ?? c` chain: a non-Reddit first candidate yields
  // undefined rather than trying the later ones.
  for (const candidate of [
    post.url_overridden_by_dest,
    post.url,
    post.thumbnail,
  ]) {
    const decoded = decodeRedditUrl(candidate);
    if (decoded !== undefined) {
      return isRedditImageHost(decoded) ? decoded : undefined;
    }
  }
  return undefined;
};
/** Trimmed post title, or undefined when absent or blank. */
const extractTitleFromPost = (post: RedditPostData): string | undefined => {
  const trimmed = post.title?.trim();
  return trimmed ? trimmed : undefined;
};
/** Trimmed author username, or undefined when absent or blank. */
const extractAuthorFromPost = (post: RedditPostData): string | undefined => {
  const trimmed = post.author?.trim();
  return trimmed ? trimmed : undefined;
};
/**
 * Convert the post's `created_utc` (seconds since the Unix epoch) to an
 * ISO 8601 string. Returns undefined when the field is absent or the
 * timestamp does not produce a representable date.
 *
 * Note: the previous `!post.created_utc` truthiness check also rejected
 * a legitimate timestamp of 0 (the epoch itself); only null/undefined
 * are rejected now. NaN still falls through to the invalid-date check.
 */
const extractDateFromPost = (post: RedditPostData): string | undefined => {
  if (post.created_utc == null) {
    return undefined;
  }
  const date = new Date(post.created_utc * 1000);
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
};
/**
 * Publisher string for a post: the prefixed subreddit name (e.g. "r/pics")
 * when present and non-blank, otherwise the generic "Reddit".
 */
const extractPublisherFromPost = (post: RedditPostData): string | undefined => {
  const subreddit = post.subreddit_name_prefixed?.trim();
  return subreddit ? subreddit : "Reddit";
};
// Reddit's favicon, served as the `logo` metadata value.
const REDDIT_LOGO_URL =
  "https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png";
/**
 * DOM fallback: scan the rendered page for Reddit-hosted <img> tags.
 * Images on the "preview.redd.it" subdomain are preferred over "i.redd.it";
 * returns undefined when neither is present.
 */
const fallbackDomImage = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
  const firstSrc = (selector: string): string | undefined =>
    htmlDom(selector)
      .map((_, el) => htmlDom(el).attr("src"))
      .get()[0];
  return (
    firstSrc('img[src*="preview.redd.it"]') || firstSrc('img[src*="i.redd.it"]')
  );
};
/**
 * DOM fallback for the post title: prefer the <shreddit-title> custom
 * element's `title` attribute, then <shreddit-post>'s `post-title`.
 * Returns the trimmed title, or undefined when neither attribute exists.
 */
const fallbackDomTitle = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
  let candidate: string | undefined = htmlDom("shreddit-title[title]")
    .first()
    .attr("title");
  if (candidate == null) {
    candidate = htmlDom("shreddit-post[post-title]").first().attr("post-title");
  }
  return candidate ? candidate.trim() : undefined;
};
/**
 * Fetch and validate the post data behind a Reddit URL via its `.json`
 * endpoint, memoizing the in-flight promise per URL for a short TTL so
 * the multiple metascraper rules below share one network request.
 *
 * Never rejects: every failure path resolves to `{ fetched: false }`,
 * which tells callers to fall back to DOM scraping instead.
 */
const fetchRedditPostData = async (url: string): Promise<RedditFetchResult> => {
  const cached = redditJsonCache.get(url);
  const now = Date.now();
  purgeExpiredCacheEntries(now);
  if (cached && cached.expiresAt > now) {
    return cached.promise;
  }
  const promise = (async () => {
    let jsonUrl: string;
    try {
      jsonUrl = buildJsonUrl(url);
    } catch (error) {
      logger.warn(
        "[MetascraperReddit] Failed to construct Reddit JSON URL",
        error,
      );
      return { fetched: false };
    }
    let response;
    try {
      response = await fetchWithProxy(jsonUrl, {
        headers: { accept: "application/json" },
      });
    } catch (error) {
      logger.warn(
        `[MetascraperReddit] Failed to fetch Reddit JSON for ${jsonUrl}`,
        error,
      );
      return { fetched: false };
    }
    if (response.status === 403) {
      // API forbidden; fall back to DOM scraping.
      return { fetched: false };
    }
    if (!response.ok) {
      logger.warn(
        `[MetascraperReddit] Reddit JSON request failed for ${jsonUrl} with status ${response.status}`,
      );
      return { fetched: false };
    }
    let payload: unknown;
    try {
      payload = await response.json();
    } catch (error) {
      logger.warn(
        `[MetascraperReddit] Failed to parse Reddit JSON for ${jsonUrl}`,
        error,
      );
      return { fetched: false };
    }
    const parsed = redditResponseSchema.safeParse(payload);
    if (!parsed.success) {
      logger.warn(
        "[MetascraperReddit] Reddit JSON schema validation failed",
        parsed.error,
      );
      return { fetched: false };
    }
    // The endpoint returns several listings (post, comments, ...); the
    // post is the first child of the first listing that has children.
    const firstListingWithChildren = parsed.data.find(
      (listing) => (listing.data.children?.length ?? 0) > 0,
    );
    return {
      fetched: true,
      post: firstListingWithChildren?.data.children?.[0]?.data,
    };
  })();
  // Cache the promise itself (not the resolved value) so concurrent
  // callers for the same URL piggyback on a single request.
  // NOTE(review): failed results are also cached for the full TTL —
  // presumably intentional to avoid hammering Reddit; confirm.
  redditJsonCache.set(url, {
    promise,
    expiresAt: now + REDDIT_CACHE_TTL_MS,
  });
  return promise;
};
/**
 * Extract the second-level domain label from a URL, e.g.
 * "https://www.reddit.com/r/x" -> "reddit".
 *
 * This approximates the `domainWithoutSuffix` behavior that first-party
 * metascraper plugins get from their tld-parsing helper, without adding
 * a dependency; multi-part suffixes like ".co.uk" are not special-cased.
 * Returns "" (after logging) when the input is not a valid URL.
 */
const domainFromUrl = (url: string): string => {
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch (error) {
    logger.error(
      "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
      error,
    );
    return "";
  }
  // "www.example.com" -> ["www", "example", "com"] -> "example"
  const labels = hostname.split(".");
  return labels.length >= 2 ? labels[labels.length - 2] : hostname;
};
/** Rule predicate: apply this plugin only to reddit.com URLs. */
const test = ({ url }: { url: string }): boolean => {
  const domain = domainFromUrl(url).toLowerCase();
  return domain === "reddit";
};
/**
 * Build the metascraper rule set for Reddit posts. Each rule first tries
 * the Reddit JSON API (shared and cached via fetchRedditPostData) and,
 * where it makes sense, falls back to scraping the rendered DOM.
 *
 * NOTE(review): the rules are async functions cast through
 * `as unknown as RulesOptions`; presumably metascraper's published types
 * don't model these async rule signatures — confirm against the installed
 * metascraper version before tightening the types.
 */
const metascraperReddit = () => {
  const rules: Rules = {
    pkgName: "metascraper-reddit",
    test,
    image: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const redditImage = extractImageFromPost(result.post);
        if (redditImage) {
          return redditImage;
        }
      }
      // If we successfully fetched JSON but found no Reddit image,
      // avoid falling back to random DOM images.
      if (result.fetched) {
        return undefined;
      }
      return fallbackDomImage({ htmlDom });
    }) as unknown as RulesOptions,
    // Title: prefer the API title; otherwise scrape shreddit elements.
    title: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const redditTitle = extractTitleFromPost(result.post);
        if (redditTitle) {
          return redditTitle;
        }
      }
      return fallbackDomTitle({ htmlDom });
    }) as unknown as RulesOptions,
    // The remaining rules are API-only: no DOM fallback exists for them.
    author: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractAuthorFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    datePublished: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractDateFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    publisher: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractPublisherFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    // Only claim the Reddit logo when we actually confirmed a post.
    logo: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return REDDIT_LOGO_URL;
      }
      return undefined;
    }) as unknown as RulesOptions,
    // Self-post body (entity-decoded HTML); falls back to the title when
    // the post has no body text. Returns null (not undefined) when the
    // post exists but neither is available.
    readableContentHtml: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const decoded = decodeHtmlEntities(result.post.selftext_html ?? "");
        // The post has no content, return the title
        return (decoded || result.post.title) ?? null;
      }
      return undefined;
    }) as unknown as RulesOptions,
  };
  return rules;
};
export default metascraperReddit;
|