import type { CheerioAPI } from "cheerio";
import type { Rules, RulesOptions } from "metascraper";
import { decode as decodeHtmlEntities } from "html-entities";
import { fetchWithProxy } from "network";
import { z } from "zod";

import logger from "@karakeep/shared/logger";

/**
 * This is a metascraper plugin that selects a better
 * 'image' attribute for Reddit links, specifically
 * those sharing images. It also extracts the post
 * title for a Reddit post instead of using the
 * default.
 *
 * As of writing this, Reddit posts do not define
 * an open-graph image (og:image) attribute, so
 * metascraper resorts to looking for images in
 * the HTML DOM and selects the first one.
 *
 * In Reddit posts, the first image is typically
 * the profile picture of the OP, which Karakeep
 * would then use for the thumbnail.
 *
 * This metascraper plugin instead looks for images
 * with the domain i.redd.it, on which Reddit hosts
 * the preview images for posts. If this plugin
 * finds an i.redd.it image, it provides that for
 * the image metadata.
 *
 * If there is no matching image, this plugin
 * returns 'undefined' and the next plugin
 * should continue to attempt to extract images.
 *
 * We also attempt to fetch the Reddit JSON response
 * (by appending '.json' to the URL) to grab the
 * title and preview images directly from the API.
 **/

const redditPreviewImageSchema = z.object({
  source: z.object({ url: z.string().optional() }).optional(),
  resolutions: z.array(z.object({ url: z.string().optional() })).optional(),
});

const redditMediaMetadataItemSchema = z.object({
  s: z.object({ u: z.string().optional() }).optional(),
  p: z.array(z.object({ u: z.string().optional() })).optional(),
});

const redditPostSchema = z.object({
  title: z.string().optional(),
  preview: z
    .object({ images: z.array(redditPreviewImageSchema).optional() })
    .optional(),
  url_overridden_by_dest: z.string().optional(),
  url: z.string().optional(),
  thumbnail: z.string().optional(),
  media_metadata: z.record(redditMediaMetadataItemSchema).optional(),
  author: z.string().optional(),
  created_utc: z.number().optional(),
  selftext: z.string().nullish(),
  selftext_html: z.string().nullish(),
  subreddit_name_prefixed: z.string().optional(),
});

type RedditPostData = z.infer<typeof redditPostSchema>;

const redditResponseSchema = z.array(
  z.object({
    data: z.object({
      children: z.array(z.object({ data: redditPostSchema })).optional(),
    }),
  }),
);
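// For reference, the schemas above target the listing shape that Reddit
// returns from a post's '.json' endpoint. An abbreviated, illustrative
// example follows; the field values are made up:
//
//   [
//     {
//       "data": {
//         "children": [
//           {
//             "data": {
//               "title": "Post title",
//               "author": "some_user",
//               "created_utc": 1700000000,
//               "subreddit_name_prefixed": "r/example",
//               "preview": {
//                 "images": [
//                   { "source": { "url": "https://preview.redd.it/abc.jpg" } }
//                 ]
//               },
//               "url_overridden_by_dest": "https://i.redd.it/abc.jpg"
//             }
//           }
//         ]
//       }
//     }
//   ]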
interface RedditFetchResult {
  fetched: boolean;
  post?: RedditPostData;
}

const REDDIT_CACHE_TTL_MS = 60 * 1000; // 1 minute TTL to avoid stale data

interface RedditCacheEntry {
  expiresAt: number;
  promise: Promise<RedditFetchResult>;
}

const redditJsonCache = new Map<string, RedditCacheEntry>();

const purgeExpiredCacheEntries = (now: number) => {
  for (const [key, entry] of redditJsonCache.entries()) {
    if (entry.expiresAt <= now) {
      redditJsonCache.delete(key);
    }
  }
};

const decodeRedditUrl = (url?: string): string | undefined => {
  if (!url) {
    return undefined;
  }
  const decoded = decodeHtmlEntities(url);
  return decoded || undefined;
};

const buildJsonUrl = (url: string): string => {
  const urlObj = new URL(url);
  if (!urlObj.pathname.endsWith(".json")) {
    urlObj.pathname = urlObj.pathname.replace(/\/?$/, ".json");
  }
  return urlObj.toString();
};

const extractImageFromMediaMetadata = (
  media_metadata?: RedditPostData["media_metadata"],
): string | undefined => {
  if (!media_metadata) {
    return undefined;
  }
  const firstItem = Object.values(media_metadata)[0];
  if (!firstItem) {
    return undefined;
  }
  return (
    decodeRedditUrl(firstItem.s?.u) ??
    decodeRedditUrl(firstItem.p?.[0]?.u) ??
    undefined
  );
};

const isRedditImageHost = (urlCandidate: string): boolean => {
  try {
    const hostname = new URL(urlCandidate).hostname;
    return hostname.includes("redd.it");
  } catch {
    return false;
  }
};

const extractImageFromPost = (post: RedditPostData): string | undefined => {
  const previewImage = post.preview?.images?.[0];
  const previewUrl =
    decodeRedditUrl(previewImage?.source?.url) ??
    decodeRedditUrl(previewImage?.resolutions?.[0]?.url);
  if (previewUrl) {
    return previewUrl;
  }

  const mediaUrl = extractImageFromMediaMetadata(post.media_metadata);
  if (mediaUrl) {
    return mediaUrl;
  }

  const directUrl =
    decodeRedditUrl(post.url_overridden_by_dest) ??
    decodeRedditUrl(post.url) ??
    decodeRedditUrl(post.thumbnail);
  if (directUrl && isRedditImageHost(directUrl)) {
    return directUrl;
  }

  return undefined;
};

const extractTitleFromPost = (post: RedditPostData): string | undefined =>
  post.title?.trim() || undefined;

const extractAuthorFromPost = (post: RedditPostData): string | undefined =>
  post.author?.trim() || undefined;

const extractDateFromPost = (post: RedditPostData): string | undefined => {
  if (!post.created_utc) {
    return undefined;
  }
  const date = new Date(post.created_utc * 1000);
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
};

const extractPublisherFromPost = (post: RedditPostData): string =>
  post.subreddit_name_prefixed?.trim() || "Reddit";

const REDDIT_LOGO_URL =
  "https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png";

const fallbackDomImage = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
  // 'preview' subdomain images are more likely to be what we're after,
  // but the image could be in the 'i' subdomain.
  // Returns undefined if neither exists.
  const previewImages = htmlDom('img[src*="preview.redd.it"]')
    .map((_, el) => htmlDom(el).attr("src"))
    .get();
  const iImages = htmlDom('img[src*="i.redd.it"]')
    .map((_, el) => htmlDom(el).attr("src"))
    .get();
  return previewImages[0] || iImages[0];
};

const fallbackDomTitle = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
  const title: string | undefined = htmlDom("shreddit-title[title]")
    .first()
    .attr("title");
  const postTitle: string | undefined =
    title ?? htmlDom("shreddit-post[post-title]").first().attr("post-title");
  return postTitle ? postTitle.trim() : undefined;
};
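/**
 * Fetches and validates the '.json' representation of a Reddit post.
 *
 * Results are memoized per URL for REDDIT_CACHE_TTL_MS so that the
 * image/title/author/date/publisher rules below share a single network
 * request instead of refetching the same post. Any failure (bad URL,
 * network error, non-OK status, invalid JSON, schema mismatch) resolves
 * to `{ fetched: false }` so the DOM-based fallbacks can take over.
 */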
const fetchRedditPostData = async (
  url: string,
): Promise<RedditFetchResult> => {
  const cached = redditJsonCache.get(url);
  const now = Date.now();
  purgeExpiredCacheEntries(now);
  if (cached && cached.expiresAt > now) {
    return cached.promise;
  }

  const promise = (async () => {
    let jsonUrl: string;
    try {
      jsonUrl = buildJsonUrl(url);
    } catch (error) {
      logger.warn(
        "[MetascraperReddit] Failed to construct Reddit JSON URL",
        error,
      );
      return { fetched: false };
    }

    let response;
    try {
      response = await fetchWithProxy(jsonUrl, {
        headers: { accept: "application/json" },
      });
    } catch (error) {
      logger.warn(
        `[MetascraperReddit] Failed to fetch Reddit JSON for ${jsonUrl}`,
        error,
      );
      return { fetched: false };
    }

    if (response.status === 403) {
      // API forbidden; fall back to DOM scraping.
      return { fetched: false };
    }

    if (!response.ok) {
      logger.warn(
        `[MetascraperReddit] Reddit JSON request failed for ${jsonUrl} with status ${response.status}`,
      );
      return { fetched: false };
    }

    let payload: unknown;
    try {
      payload = await response.json();
    } catch (error) {
      logger.warn(
        `[MetascraperReddit] Failed to parse Reddit JSON for ${jsonUrl}`,
        error,
      );
      return { fetched: false };
    }

    const parsed = redditResponseSchema.safeParse(payload);
    if (!parsed.success) {
      logger.warn(
        "[MetascraperReddit] Reddit JSON schema validation failed",
        parsed.error,
      );
      return { fetched: false };
    }

    const firstListingWithChildren = parsed.data.find(
      (listing) => (listing.data.children?.length ?? 0) > 0,
    );

    return {
      fetched: true,
      post: firstListingWithChildren?.data.children?.[0]?.data,
    };
  })();

  redditJsonCache.set(url, {
    promise,
    expiresAt: now + REDDIT_CACHE_TTL_MS,
  });

  return promise;
};

const domainFromUrl = (url: string): string => {
  /**
   * First-party metascraper plugins import metascraper-helpers,
   * which exposes a parseUrl function backed by the tldts package.
   * This function does something similar to the 'domainWithoutSuffix'
   * field from tldts, without requiring any additional packages.
   **/
  try {
    // Create a URL instance to parse the hostname
    const hostname = new URL(url).hostname;
    const parts = hostname.split(".");
    // Return the part before the TLD (assuming at least two segments).
    // For example, "www.example.com" -> ["www", "example", "com"]
    if (parts.length >= 2) {
      return parts[parts.length - 2];
    }
    return hostname;
  } catch (error) {
    logger.error(
      "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
      error,
    );
    return "";
  }
};

const test = ({ url }: { url: string }): boolean =>
  domainFromUrl(url).toLowerCase() === "reddit";
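// A few worked examples of the matcher above, derived from the logic in
// domainFromUrl rather than from an exhaustive list of Reddit hostnames:
//   https://www.reddit.com/r/...  -> "reddit" -> matched
//   https://old.reddit.com/r/...  -> "reddit" -> matched
//   https://redd.it/abc123        -> "redd"   -> not matched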
const metascraperReddit = () => {
  const rules: Rules = {
    pkgName: "metascraper-reddit",
    test,
    image: (async ({
      url,
      htmlDom,
    }: {
      url: string;
      htmlDom: CheerioAPI;
    }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const redditImage = extractImageFromPost(result.post);
        if (redditImage) {
          return redditImage;
        }
      }
      // If we successfully fetched JSON but found no Reddit image,
      // avoid falling back to random DOM images.
      if (result.fetched) {
        return undefined;
      }
      return fallbackDomImage({ htmlDom });
    }) as unknown as RulesOptions,
    title: (async ({
      url,
      htmlDom,
    }: {
      url: string;
      htmlDom: CheerioAPI;
    }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const redditTitle = extractTitleFromPost(result.post);
        if (redditTitle) {
          return redditTitle;
        }
      }
      return fallbackDomTitle({ htmlDom });
    }) as unknown as RulesOptions,
    author: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractAuthorFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    datePublished: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractDateFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    publisher: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractPublisherFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    logo: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return REDDIT_LOGO_URL;
      }
      return undefined;
    }) as unknown as RulesOptions,
    readableContentHtml: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const decoded = decodeHtmlEntities(result.post.selftext_html ?? "");
        // If the post has no selftext content, fall back to the title.
        return (decoded || result.post.title) ?? null;
      }
      return undefined;
    }) as unknown as RulesOptions,
  };
  return rules;
};

export default metascraperReddit;
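/*
 * Usage sketch (illustrative only; it assumes the `metascraper` package is
 * available to the caller, and the relative import path below is
 * hypothetical):
 *
 *   import createMetascraper from "metascraper";
 *   import metascraperReddit from "./metascraper-reddit";
 *
 *   const metascraper = createMetascraper([metascraperReddit()]);
 *   const metadata = await metascraper({ url, html });
 *   // metadata.image and metadata.title now prefer values extracted from
 *   // Reddit's JSON API when the URL points at a reddit.com post.
 */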