-rw-r--r--  apps/workers/metascraper-plugins/metascraper-reddit.ts | 358
-rw-r--r--  apps/workers/package.json                              |   1
-rw-r--r--  apps/workers/workers/crawlerWorker.ts                  |  17
-rw-r--r--  pnpm-lock.yaml                                         |   3
4 files changed, 346 insertions(+), 33 deletions(-)
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts
index 1fbee3ea..a5de5fe3 100644
--- a/apps/workers/metascraper-plugins/metascraper-reddit.ts
+++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts
@@ -1,4 +1,8 @@
-import type { Rules } from "metascraper";
+import type { CheerioAPI } from "cheerio";
+import type { Rules, RulesOptions } from "metascraper";
+import { decode as decodeHtmlEntities } from "html-entities";
+import { fetchWithProxy } from "network";
+import { z } from "zod";
import logger from "@karakeep/shared/logger";
@@ -28,15 +32,267 @@ import logger from "@karakeep/shared/logger";
* will return 'undefined' and the next plugin
* should continue to attempt to extract images.
*
- * Note: there is another way to accomplish this.
- * If '.json' is appended to a Reddit url, the
- * server will provide a JSON document summarizing
- * the post. If there are preview images, they are
- * included in a section of the JSON. To prevent
- * additional server requests, this method is not
- * currently being used.
+ * We also fetch the Reddit JSON response (by appending
+ * '.json' to the URL) to grab the title, preview images,
+ * and other post metadata directly from the API.
**/
+const redditPreviewImageSchema = z.object({
+ source: z.object({ url: z.string().optional() }).optional(),
+ resolutions: z.array(z.object({ url: z.string().optional() })).optional(),
+});
+
+const redditMediaMetadataItemSchema = z.object({
+ s: z.object({ u: z.string().optional() }).optional(),
+ p: z.array(z.object({ u: z.string().optional() })).optional(),
+});
+
+const redditPostSchema = z.object({
+ title: z.string().optional(),
+ preview: z
+ .object({ images: z.array(redditPreviewImageSchema).optional() })
+ .optional(),
+ url_overridden_by_dest: z.string().optional(),
+ url: z.string().optional(),
+ thumbnail: z.string().optional(),
+ media_metadata: z.record(redditMediaMetadataItemSchema).optional(),
+ author: z.string().optional(),
+ created_utc: z.number().optional(),
+ selftext: z.string().nullish(),
+ selftext_html: z.string().nullish(),
+ subreddit_name_prefixed: z.string().optional(),
+});
+
+type RedditPostData = z.infer<typeof redditPostSchema>;
+
+const redditResponseSchema = z.array(
+ z.object({
+ data: z.object({
+ children: z.array(z.object({ data: redditPostSchema })).optional(),
+ }),
+ }),
+);
+
+interface RedditFetchResult {
+ fetched: boolean;
+ post?: RedditPostData;
+}
+
+const REDDIT_CACHE_TTL_MS = 60 * 1000; // 1 minute TTL to avoid stale data
+
+interface RedditCacheEntry {
+ expiresAt: number;
+ promise: Promise<RedditFetchResult>;
+}
+
+const redditJsonCache = new Map<string, RedditCacheEntry>();
+
+const purgeExpiredCacheEntries = (now: number) => {
+ for (const [key, entry] of redditJsonCache.entries()) {
+ if (entry.expiresAt <= now) {
+ redditJsonCache.delete(key);
+ }
+ }
+};
+
+const decodeRedditUrl = (url?: string): string | undefined => {
+ if (!url) {
+ return undefined;
+ }
+ const decoded = decodeHtmlEntities(url);
+ return decoded || undefined;
+};
+
+const buildJsonUrl = (url: string): string => {
+ const urlObj = new URL(url);
+
+ if (!urlObj.pathname.endsWith(".json")) {
+ urlObj.pathname = urlObj.pathname.replace(/\/?$/, ".json");
+ }
+
+ return urlObj.toString();
+};
+
+const extractImageFromMediaMetadata = (
+ media_metadata?: RedditPostData["media_metadata"],
+): string | undefined => {
+ if (!media_metadata) {
+ return undefined;
+ }
+ const firstItem = Object.values(media_metadata)[0];
+ if (!firstItem) {
+ return undefined;
+ }
+
+ return (
+ decodeRedditUrl(firstItem.s?.u) ??
+ decodeRedditUrl(firstItem.p?.[0]?.u) ??
+ undefined
+ );
+};
+
+const isRedditImageHost = (urlCandidate: string): boolean => {
+ try {
+ const hostname = new URL(urlCandidate).hostname;
+ return hostname.includes("redd.it");
+ } catch {
+ return false;
+ }
+};
+
+const extractImageFromPost = (post: RedditPostData): string | undefined => {
+ const previewImage = post.preview?.images?.[0];
+ const previewUrl =
+ decodeRedditUrl(previewImage?.source?.url) ??
+ decodeRedditUrl(previewImage?.resolutions?.[0]?.url);
+ if (previewUrl) {
+ return previewUrl;
+ }
+
+ const mediaUrl = extractImageFromMediaMetadata(post.media_metadata);
+ if (mediaUrl) {
+ return mediaUrl;
+ }
+
+ const directUrl =
+ decodeRedditUrl(post.url_overridden_by_dest) ??
+ decodeRedditUrl(post.url) ??
+ decodeRedditUrl(post.thumbnail);
+
+ if (directUrl && isRedditImageHost(directUrl)) {
+ return directUrl;
+ }
+
+ return undefined;
+};
+
+const extractTitleFromPost = (post: RedditPostData): string | undefined =>
+ post.title?.trim() || undefined;
+
+const extractAuthorFromPost = (post: RedditPostData): string | undefined =>
+ post.author?.trim() || undefined;
+
+const extractDateFromPost = (post: RedditPostData): string | undefined => {
+ if (!post.created_utc) {
+ return undefined;
+ }
+ const date = new Date(post.created_utc * 1000);
+ return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
+};
+
+const extractPublisherFromPost = (post: RedditPostData): string | undefined =>
+ post.subreddit_name_prefixed?.trim() || "Reddit";
+
+const REDDIT_LOGO_URL =
+ "https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png";
+
+const fallbackDomImage = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
+ // 'preview' subdomain images are more likely to be what we're after
+ // but it could be in the 'i' subdomain.
+ // returns undefined if neither exists
+ const previewImages = htmlDom('img[src*="preview.redd.it"]')
+ .map((_, el) => htmlDom(el).attr("src"))
+ .get();
+ const iImages = htmlDom('img[src*="i.redd.it"]')
+ .map((_, el) => htmlDom(el).attr("src"))
+ .get();
+ return previewImages[0] || iImages[0];
+};
+
+const fallbackDomTitle = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
+ const title: string | undefined = htmlDom("shreddit-title[title]")
+ .first()
+ .attr("title");
+ const postTitle: string | undefined =
+ title ?? htmlDom("shreddit-post[post-title]").first().attr("post-title");
+ return postTitle ? postTitle.trim() : undefined;
+};
+
+const fetchRedditPostData = async (url: string): Promise<RedditFetchResult> => {
+ const cached = redditJsonCache.get(url);
+ const now = Date.now();
+
+ purgeExpiredCacheEntries(now);
+
+ if (cached && cached.expiresAt > now) {
+ return cached.promise;
+ }
+
+ const promise = (async () => {
+ let jsonUrl: string;
+ try {
+ jsonUrl = buildJsonUrl(url);
+ } catch (error) {
+ logger.warn(
+ "[MetascraperReddit] Failed to construct Reddit JSON URL",
+ error,
+ );
+ return { fetched: false };
+ }
+
+ let response;
+ try {
+ response = await fetchWithProxy(jsonUrl, {
+ headers: { accept: "application/json" },
+ });
+ } catch (error) {
+ logger.warn(
+ `[MetascraperReddit] Failed to fetch Reddit JSON for ${jsonUrl}`,
+ error,
+ );
+ return { fetched: false };
+ }
+
+ if (response.status === 403) {
+ // API forbidden; fall back to DOM scraping.
+ return { fetched: false };
+ }
+
+ if (!response.ok) {
+ logger.warn(
+ `[MetascraperReddit] Reddit JSON request failed for ${jsonUrl} with status ${response.status}`,
+ );
+ return { fetched: false };
+ }
+
+ let payload: unknown;
+ try {
+ payload = await response.json();
+ } catch (error) {
+ logger.warn(
+ `[MetascraperReddit] Failed to parse Reddit JSON for ${jsonUrl}`,
+ error,
+ );
+ return { fetched: false };
+ }
+
+ const parsed = redditResponseSchema.safeParse(payload);
+ if (!parsed.success) {
+ logger.warn(
+ "[MetascraperReddit] Reddit JSON schema validation failed",
+ parsed.error,
+ );
+ return { fetched: false };
+ }
+
+ const firstListingWithChildren = parsed.data.find(
+ (listing) => (listing.data.children?.length ?? 0) > 0,
+ );
+
+ return {
+ fetched: true,
+ post: firstListingWithChildren?.data.children?.[0]?.data,
+ };
+ })();
+
+ redditJsonCache.set(url, {
+ promise,
+ expiresAt: now + REDDIT_CACHE_TTL_MS,
+ });
+
+ return promise;
+};
+
const domainFromUrl = (url: string): string => {
/**
* First-party metascraper plugins import metascraper-helpers,
@@ -71,27 +327,71 @@ const metascraperReddit = () => {
const rules: Rules = {
pkgName: "metascraper-reddit",
test,
- image: ({ htmlDom }) => {
- // 'preview' subdomain images are more likely to be what we're after
- // but it could be in the 'i' subdomain.
- // returns undefined if neither exists
- const previewImages = htmlDom('img[src*="preview.redd.it"]')
- .map((i, el) => htmlDom(el).attr("src"))
- .get();
- const iImages = htmlDom('img[src*="i.redd.it"]')
- .map((i, el) => htmlDom(el).attr("src"))
- .get();
- return previewImages[0] || iImages[0];
- },
- title: ({ htmlDom }) => {
- const title: string | undefined = htmlDom("shreddit-title[title]")
- .first()
- .attr("title");
- const postTitle: string | undefined =
- title ??
- htmlDom("shreddit-post[post-title]").first().attr("post-title");
- return postTitle ? postTitle.trim() : undefined;
- },
+ image: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
+ const result = await fetchRedditPostData(url);
+ if (result.post) {
+ const redditImage = extractImageFromPost(result.post);
+ if (redditImage) {
+ return redditImage;
+ }
+ }
+
+ // If we successfully fetched JSON but found no Reddit image,
+ // avoid falling back to random DOM images.
+ if (result.fetched) {
+ return undefined;
+ }
+
+ return fallbackDomImage({ htmlDom });
+ }) as unknown as RulesOptions,
+ title: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
+ const result = await fetchRedditPostData(url);
+ if (result.post) {
+ const redditTitle = extractTitleFromPost(result.post);
+ if (redditTitle) {
+ return redditTitle;
+ }
+ }
+
+ return fallbackDomTitle({ htmlDom });
+ }) as unknown as RulesOptions,
+ author: (async ({ url }: { url: string }) => {
+ const result = await fetchRedditPostData(url);
+ if (result.post) {
+ return extractAuthorFromPost(result.post);
+ }
+ return undefined;
+ }) as unknown as RulesOptions,
+ datePublished: (async ({ url }: { url: string }) => {
+ const result = await fetchRedditPostData(url);
+ if (result.post) {
+ return extractDateFromPost(result.post);
+ }
+ return undefined;
+ }) as unknown as RulesOptions,
+ publisher: (async ({ url }: { url: string }) => {
+ const result = await fetchRedditPostData(url);
+ if (result.post) {
+ return extractPublisherFromPost(result.post);
+ }
+ return undefined;
+ }) as unknown as RulesOptions,
+ logo: (async ({ url }: { url: string }) => {
+ const result = await fetchRedditPostData(url);
+ if (result.post) {
+ return REDDIT_LOGO_URL;
+ }
+ return undefined;
+ }) as unknown as RulesOptions,
+ readableContentHtml: (async ({ url }: { url: string }) => {
+ const result = await fetchRedditPostData(url);
+ if (result.post) {
+ const decoded = decodeHtmlEntities(result.post.selftext_html ?? "");
+      // If the post has no selftext, fall back to the title
+ return (decoded || result.post.title) ?? null;
+ }
+ return undefined;
+ }) as unknown as RulesOptions,
};
return rules;
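
For reference, an abridged sketch of the payload shape that redditPostSchema and redditResponseSchema above expect from Reddit's '.json' endpoint. All field values here are invented for illustration; the endpoint returns an array of listings for a comment-page URL, the first holding the post and the second the comment tree.

// Invented example data mirroring redditResponseSchema above.
// Note the HTML-escaped '&amp;' in the preview URL; this is why the
// plugin routes every URL through html-entities' decode() before use.
const examplePayload = [
  {
    data: {
      children: [
        {
          data: {
            title: "Example post title",
            author: "some_user",
            created_utc: 1700000000,
            subreddit_name_prefixed: "r/example",
            preview: {
              images: [
                {
                  source: {
                    url: "https://preview.redd.it/abc.jpg?width=1080&amp;s=0123",
                  },
                },
              ],
            },
          },
        },
      ],
    },
  },
  { data: { children: [] } }, // empty listing; skipped by the find() above
];

// redditResponseSchema.safeParse(examplePayload).success === true
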
diff --git a/apps/workers/package.json b/apps/workers/package.json
index 7a5a1c81..df6c8618 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -22,6 +22,7 @@
"drizzle-orm": "^0.44.2",
"execa": "9.3.1",
"hono": "^4.10.6",
+ "html-entities": "^2.6.0",
"http-proxy-agent": "^7.0.2",
"https-proxy-agent": "^7.0.6",
"ipaddr.js": "^2.2.0",
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index a2495423..2c96338a 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1128,10 +1128,19 @@ async function crawlAndParseUrl(
]);
abortSignal.throwIfAborted();
- let readableContent = await Promise.race([
- extractReadableContent(htmlContent, browserUrl, jobId),
- abortPromise(abortSignal),
- ]);
+ let readableContent: { content: string } | null = meta.readableContentHtml
+ ? { content: meta.readableContentHtml }
+ : null;
+ if (!readableContent) {
+ readableContent = await Promise.race([
+ extractReadableContent(
+ meta.contentHtml ?? htmlContent,
+ browserUrl,
+ jobId,
+ ),
+ abortPromise(abortSignal),
+ ]);
+ }
abortSignal.throwIfAborted();
const screenshotAssetInfo = await Promise.race([
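
The effect of this change, as a condensed sketch: a plugin-supplied readableContentHtml now bypasses readability extraction entirely, and a plugin-supplied contentHtml replaces the raw page as the extraction input. The helper below is hypothetical; extractReadableContent and the meta fields come from the hunk above.

interface PageMeta {
  readableContentHtml?: string;
  contentHtml?: string;
}

// Hypothetical helper illustrating the new precedence order.
async function resolveReadableContent(
  meta: PageMeta,
  htmlContent: string,
  extract: (html: string) => Promise<{ content: string } | null>,
): Promise<{ content: string } | null> {
  // 1. A plugin already produced readable HTML (e.g. a Reddit selftext):
  //    use it verbatim and skip extraction.
  if (meta.readableContentHtml) {
    return { content: meta.readableContentHtml };
  }
  // 2. Otherwise run the usual extraction, preferring plugin-supplied
  //    HTML over the raw crawled page.
  return extract(meta.contentHtml ?? htmlContent);
}
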
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 5403cf04..febde7ba 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -835,6 +835,9 @@ importers:
hono:
specifier: ^4.10.6
version: 4.10.6
+ html-entities:
+ specifier: ^2.6.0
+ version: 2.6.0
http-proxy-agent:
specifier: ^7.0.2
version: 7.0.2