diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-12-13 22:09:57 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-12-13 23:34:19 +0000 |
| commit | f5c32d940ea6c2a6da6c225c2de1eba13b49f9e0 (patch) | |
| tree | 81d8458e6a92fe1f93226583ab30f1d3495c276c /apps | |
| parent | d6dd8ebdb614d39890810ff9fbc8d71d35af4f03 (diff) | |
| download | karakeep-f5c32d940ea6c2a6da6c225c2de1eba13b49f9e0.tar.zst | |
feat: use reddit API for metadata extraction. Fixes #1853 #1883
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/workers/metascraper-plugins/metascraper-reddit.ts | 358 | ||||
| -rw-r--r-- | apps/workers/package.json | 1 | ||||
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 17 |
3 files changed, 343 insertions, 33 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-reddit.ts b/apps/workers/metascraper-plugins/metascraper-reddit.ts index 1fbee3ea..a5de5fe3 100644 --- a/apps/workers/metascraper-plugins/metascraper-reddit.ts +++ b/apps/workers/metascraper-plugins/metascraper-reddit.ts @@ -1,4 +1,8 @@ -import type { Rules } from "metascraper"; +import type { CheerioAPI } from "cheerio"; +import type { Rules, RulesOptions } from "metascraper"; +import { decode as decodeHtmlEntities } from "html-entities"; +import { fetchWithProxy } from "network"; +import { z } from "zod"; import logger from "@karakeep/shared/logger"; @@ -28,15 +32,267 @@ import logger from "@karakeep/shared/logger"; * will return 'undefined' and the next plugin * should continue to attempt to extract images. * - * Note: there is another way to accomplish this. - * If '.json' is appended to a Reddit url, the - * server will provide a JSON document summarizing - * the post. If there are preview images, they are - * included in a section of the JSON. To prevent - * additional server requests, this method is not - * currently being used. + * We also attempt to fetch the Reddit JSON response + * (by appending '.json' to the URL) to grab the + * title and preview images directly from the API. **/ +const redditPreviewImageSchema = z.object({ + source: z.object({ url: z.string().optional() }).optional(), + resolutions: z.array(z.object({ url: z.string().optional() })).optional(), +}); + +const redditMediaMetadataItemSchema = z.object({ + s: z.object({ u: z.string().optional() }).optional(), + p: z.array(z.object({ u: z.string().optional() })).optional(), +}); + +const redditPostSchema = z.object({ + title: z.string().optional(), + preview: z + .object({ images: z.array(redditPreviewImageSchema).optional() }) + .optional(), + url_overridden_by_dest: z.string().optional(), + url: z.string().optional(), + thumbnail: z.string().optional(), + media_metadata: z.record(redditMediaMetadataItemSchema).optional(), + author: z.string().optional(), + created_utc: z.number().optional(), + selftext: z.string().nullish(), + selftext_html: z.string().nullish(), + subreddit_name_prefixed: z.string().optional(), +}); + +type RedditPostData = z.infer<typeof redditPostSchema>; + +const redditResponseSchema = z.array( + z.object({ + data: z.object({ + children: z.array(z.object({ data: redditPostSchema })).optional(), + }), + }), +); + +interface RedditFetchResult { + fetched: boolean; + post?: RedditPostData; +} + +const REDDIT_CACHE_TTL_MS = 60 * 1000; // 1 minute TTL to avoid stale data + +interface RedditCacheEntry { + expiresAt: number; + promise: Promise<RedditFetchResult>; +} + +const redditJsonCache = new Map<string, RedditCacheEntry>(); + +const purgeExpiredCacheEntries = (now: number) => { + for (const [key, entry] of redditJsonCache.entries()) { + if (entry.expiresAt <= now) { + redditJsonCache.delete(key); + } + } +}; + +const decodeRedditUrl = (url?: string): string | undefined => { + if (!url) { + return undefined; + } + const decoded = decodeHtmlEntities(url); + return decoded || undefined; +}; + +const buildJsonUrl = (url: string): string => { + const urlObj = new URL(url); + + if (!urlObj.pathname.endsWith(".json")) { + urlObj.pathname = urlObj.pathname.replace(/\/?$/, ".json"); + } + + return urlObj.toString(); +}; + +const extractImageFromMediaMetadata = ( + media_metadata?: RedditPostData["media_metadata"], +): string | undefined => { + if (!media_metadata) { + return undefined; + } + const firstItem = Object.values(media_metadata)[0]; + if (!firstItem) { + return undefined; + } + + return ( + decodeRedditUrl(firstItem.s?.u) ?? + decodeRedditUrl(firstItem.p?.[0]?.u) ?? + undefined + ); +}; + +const isRedditImageHost = (urlCandidate: string): boolean => { + try { + const hostname = new URL(urlCandidate).hostname; + return hostname.includes("redd.it"); + } catch { + return false; + } +}; + +const extractImageFromPost = (post: RedditPostData): string | undefined => { + const previewImage = post.preview?.images?.[0]; + const previewUrl = + decodeRedditUrl(previewImage?.source?.url) ?? + decodeRedditUrl(previewImage?.resolutions?.[0]?.url); + if (previewUrl) { + return previewUrl; + } + + const mediaUrl = extractImageFromMediaMetadata(post.media_metadata); + if (mediaUrl) { + return mediaUrl; + } + + const directUrl = + decodeRedditUrl(post.url_overridden_by_dest) ?? + decodeRedditUrl(post.url) ?? + decodeRedditUrl(post.thumbnail); + + if (directUrl && isRedditImageHost(directUrl)) { + return directUrl; + } + + return undefined; +}; + +const extractTitleFromPost = (post: RedditPostData): string | undefined => + post.title?.trim() || undefined; + +const extractAuthorFromPost = (post: RedditPostData): string | undefined => + post.author?.trim() || undefined; + +const extractDateFromPost = (post: RedditPostData): string | undefined => { + if (!post.created_utc) { + return undefined; + } + const date = new Date(post.created_utc * 1000); + return Number.isNaN(date.getTime()) ? undefined : date.toISOString(); +}; + +const extractPublisherFromPost = (post: RedditPostData): string | undefined => + post.subreddit_name_prefixed?.trim() || "Reddit"; + +const REDDIT_LOGO_URL = + "https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png"; + +const fallbackDomImage = ({ htmlDom }: { htmlDom: CheerioAPI }) => { + // 'preview' subdomain images are more likely to be what we're after + // but it could be in the 'i' subdomain. + // returns undefined if neither exists + const previewImages = htmlDom('img[src*="preview.redd.it"]') + .map((_, el) => htmlDom(el).attr("src")) + .get(); + const iImages = htmlDom('img[src*="i.redd.it"]') + .map((_, el) => htmlDom(el).attr("src")) + .get(); + return previewImages[0] || iImages[0]; +}; + +const fallbackDomTitle = ({ htmlDom }: { htmlDom: CheerioAPI }) => { + const title: string | undefined = htmlDom("shreddit-title[title]") + .first() + .attr("title"); + const postTitle: string | undefined = + title ?? htmlDom("shreddit-post[post-title]").first().attr("post-title"); + return postTitle ? postTitle.trim() : undefined; +}; + +const fetchRedditPostData = async (url: string): Promise<RedditFetchResult> => { + const cached = redditJsonCache.get(url); + const now = Date.now(); + + purgeExpiredCacheEntries(now); + + if (cached && cached.expiresAt > now) { + return cached.promise; + } + + const promise = (async () => { + let jsonUrl: string; + try { + jsonUrl = buildJsonUrl(url); + } catch (error) { + logger.warn( + "[MetascraperReddit] Failed to construct Reddit JSON URL", + error, + ); + return { fetched: false }; + } + + let response; + try { + response = await fetchWithProxy(jsonUrl, { + headers: { accept: "application/json" }, + }); + } catch (error) { + logger.warn( + `[MetascraperReddit] Failed to fetch Reddit JSON for ${jsonUrl}`, + error, + ); + return { fetched: false }; + } + + if (response.status === 403) { + // API forbidden; fall back to DOM scraping. + return { fetched: false }; + } + + if (!response.ok) { + logger.warn( + `[MetascraperReddit] Reddit JSON request failed for ${jsonUrl} with status ${response.status}`, + ); + return { fetched: false }; + } + + let payload: unknown; + try { + payload = await response.json(); + } catch (error) { + logger.warn( + `[MetascraperReddit] Failed to parse Reddit JSON for ${jsonUrl}`, + error, + ); + return { fetched: false }; + } + + const parsed = redditResponseSchema.safeParse(payload); + if (!parsed.success) { + logger.warn( + "[MetascraperReddit] Reddit JSON schema validation failed", + parsed.error, + ); + return { fetched: false }; + } + + const firstListingWithChildren = parsed.data.find( + (listing) => (listing.data.children?.length ?? 0) > 0, + ); + + return { + fetched: true, + post: firstListingWithChildren?.data.children?.[0]?.data, + }; + })(); + + redditJsonCache.set(url, { + promise, + expiresAt: now + REDDIT_CACHE_TTL_MS, + }); + + return promise; +}; + const domainFromUrl = (url: string): string => { /** * First-party metascraper plugins import metascraper-helpers, @@ -71,27 +327,71 @@ const metascraperReddit = () => { const rules: Rules = { pkgName: "metascraper-reddit", test, - image: ({ htmlDom }) => { - // 'preview' subdomain images are more likely to be what we're after - // but it could be in the 'i' subdomain. - // returns undefined if neither exists - const previewImages = htmlDom('img[src*="preview.redd.it"]') - .map((i, el) => htmlDom(el).attr("src")) - .get(); - const iImages = htmlDom('img[src*="i.redd.it"]') - .map((i, el) => htmlDom(el).attr("src")) - .get(); - return previewImages[0] || iImages[0]; - }, - title: ({ htmlDom }) => { - const title: string | undefined = htmlDom("shreddit-title[title]") - .first() - .attr("title"); - const postTitle: string | undefined = - title ?? - htmlDom("shreddit-post[post-title]").first().attr("post-title"); - return postTitle ? postTitle.trim() : undefined; - }, + image: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => { + const result = await fetchRedditPostData(url); + if (result.post) { + const redditImage = extractImageFromPost(result.post); + if (redditImage) { + return redditImage; + } + } + + // If we successfully fetched JSON but found no Reddit image, + // avoid falling back to random DOM images. + if (result.fetched) { + return undefined; + } + + return fallbackDomImage({ htmlDom }); + }) as unknown as RulesOptions, + title: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => { + const result = await fetchRedditPostData(url); + if (result.post) { + const redditTitle = extractTitleFromPost(result.post); + if (redditTitle) { + return redditTitle; + } + } + + return fallbackDomTitle({ htmlDom }); + }) as unknown as RulesOptions, + author: (async ({ url }: { url: string }) => { + const result = await fetchRedditPostData(url); + if (result.post) { + return extractAuthorFromPost(result.post); + } + return undefined; + }) as unknown as RulesOptions, + datePublished: (async ({ url }: { url: string }) => { + const result = await fetchRedditPostData(url); + if (result.post) { + return extractDateFromPost(result.post); + } + return undefined; + }) as unknown as RulesOptions, + publisher: (async ({ url }: { url: string }) => { + const result = await fetchRedditPostData(url); + if (result.post) { + return extractPublisherFromPost(result.post); + } + return undefined; + }) as unknown as RulesOptions, + logo: (async ({ url }: { url: string }) => { + const result = await fetchRedditPostData(url); + if (result.post) { + return REDDIT_LOGO_URL; + } + return undefined; + }) as unknown as RulesOptions, + readableContentHtml: (async ({ url }: { url: string }) => { + const result = await fetchRedditPostData(url); + if (result.post) { + const decoded = decodeHtmlEntities(result.post.selftext_html ?? ""); + // The post has no content, return the title + return (decoded || result.post.title) ?? null; + } + return undefined; + }) as unknown as RulesOptions, }; return rules; diff --git a/apps/workers/package.json b/apps/workers/package.json index 7a5a1c81..df6c8618 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -22,6 +22,7 @@ "drizzle-orm": "^0.44.2", "execa": "9.3.1", "hono": "^4.10.6", + "html-entities": "^2.6.0", "http-proxy-agent": "^7.0.2", "https-proxy-agent": "^7.0.6", "ipaddr.js": "^2.2.0", diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index a2495423..2c96338a 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -1128,10 +1128,19 @@ async function crawlAndParseUrl( ]); abortSignal.throwIfAborted(); - let readableContent = await Promise.race([ - extractReadableContent(htmlContent, browserUrl, jobId), - abortPromise(abortSignal), - ]); + let readableContent: { content: string } | null = meta.readableContentHtml + ? { content: meta.readableContentHtml } + : null; + if (!readableContent) { + readableContent = await Promise.race([ + extractReadableContent( + meta.contentHtml ?? htmlContent, + browserUrl, + jobId, + ), + abortPromise(abortSignal), + ]); + } abortSignal.throwIfAborted(); const screenshotAssetInfo = await Promise.race([ |
