aboutsummaryrefslogtreecommitdiffstats
path: root/apps/workers/metascraper-plugins/metascraper-amazon-improved.ts
blob: ea9bf2e90bbff5a760411c426785b25d52215245 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import type { Rules } from "metascraper";

/**
 * Improved Amazon metascraper plugin that fixes image extraction.
 *
 * The default metascraper-amazon package uses the `.a-dynamic-image` selector,
 * which matches the FIRST element with that class. On amazon.com pages,
 * this is often the Prime logo instead of the product image.
 *
 * This plugin uses more specific selectors to target the actual product
 * image:
 * - #landingImage: The main product image ID
 * - #imgTagWrapperId img: Fallback container for product images
 * - #imageBlock img: Additional fallback for newer Amazon layouts
 *
 * By placing this plugin BEFORE metascraperAmazon() in the plugin chain,
 * we ensure the correct image is extracted while keeping all other Amazon
 * metadata (title, brand, description) from the original plugin.
 */

/**
 * Matches http(s) URLs whose host is `amazon.<suffix>`, `amzn.<suffix>` or
 * `a.co`, optionally preceded by subdomain labels (e.g. `www.`, `smile.`).
 *
 * The pattern is anchored to the start of the URL and confined to the host
 * part, so:
 * - the keywords are not matched inside a path, query string or fragment
 *   (e.g. `https://example.com/?next=https://www.amazon.com/...`);
 * - `a.co` is not matched as the tail of another domain (e.g. `nokia.co`);
 * - a host that merely ends in "amazon" (e.g. `notamazon.com`) is rejected.
 *
 * Optional userinfo (`user@`) and port (`:443`) are tolerated. The host must
 * be followed by `/`, `?`, `#` or the end of the string, so a bare
 * `https://www.amazon.com` is accepted as well.
 */
const REGEX_AMAZON_URL =
  /^https?:\/\/(?:[^/?#@]*@)?(?:[^/?#:@]+\.)?(?:amazon\.[^/?#:@]+|amzn\.[^/?#:@]+|a\.co)(?::\d+)?(?:[/?#]|$)/i;

/**
 * URL guard used as the `test` entry of the rule set defined in this file.
 *
 * @param context - Object carrying the page `url`.
 * @returns `true` when `url` matches `REGEX_AMAZON_URL`, otherwise `false`.
 */
const test = (context: { url: string }): boolean => {
  const { url } = context;
  return REGEX_AMAZON_URL.test(url);
};

/**
 * Creates the metascraper rule set for Amazon product images.
 *
 * The rule set carries the package name, the `test` URL guard and a single
 * `image` rule. The `image` rule walks a list of selectors from most to
 * least specific and, for each one, reads `data-old-hires` (the
 * high-resolution variant) before falling back to `src`. The first non-empty
 * value wins.
 *
 * @returns The rule set object to hand to metascraper.
 */
const metascraperAmazonImproved = () => {
  // Candidate elements, in lookup order.
  const imageSelectors = [
    "#landingImage", // main product image ID
    "#imgTagWrapperId img", // image block container
    "#imageBlock img", // additional fallback for newer Amazon layouts
  ];

  // Attributes tried on each candidate; the high-resolution one goes first.
  const imageAttributes = ["data-old-hires", "src"];

  const rules: Rules = {
    pkgName: "metascraper-amazon-improved",
    test,
    image: ({ htmlDom }) => {
      for (const selector of imageSelectors) {
        for (const attribute of imageAttributes) {
          // `.attr()` reads from the first matched element only, so
          // `.first()` just makes that explicit for every selector.
          const candidate = htmlDom(selector).first().attr(attribute);
          if (candidate) {
            return candidate;
          }
        }
      }

      // Nothing usable found: return undefined so the next plugin can try.
      return undefined;
    },
  };

  return rules;
};

export default metascraperAmazonImproved;