From b3196354d1272e6440119ba170c9d8369149baa4 Mon Sep 17 00:00:00 2001 From: Randall Hand Date: Sun, 14 Dec 2025 06:15:53 -0500 Subject: fix: Fix Amazon product image extraction on amazon.com URLs (#2108) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The metascraper-amazon package extracts the first .a-dynamic-image element, which on amazon.com is often the Prime logo instead of the product image. This works fine on amazon.co.uk where the product image appears first in the DOM. Created a custom metascraper plugin that uses more specific selectors (#landingImage, #imgTagWrapperId, #imageBlock) to target the actual product image. By placing this plugin before metascraperAmazon() in the chain, we fix image extraction while preserving all other Amazon metadata (title, brand, description). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude --- .../metascraper-amazon-improved.ts | 77 ++++++++++++++++++++++ apps/workers/workers/crawlerWorker.ts | 2 + 2 files changed, 79 insertions(+) create mode 100644 apps/workers/metascraper-plugins/metascraper-amazon-improved.ts (limited to 'apps') diff --git a/apps/workers/metascraper-plugins/metascraper-amazon-improved.ts b/apps/workers/metascraper-plugins/metascraper-amazon-improved.ts new file mode 100644 index 00000000..ea9bf2e9 --- /dev/null +++ b/apps/workers/metascraper-plugins/metascraper-amazon-improved.ts @@ -0,0 +1,77 @@ +import type { Rules } from "metascraper"; + +/** + * Improved Amazon metascraper plugin that fixes image extraction. + * + * The default metascraper-amazon package uses `.a-dynamic-image` selector + * which matches the FIRST element with that class. On amazon.com pages, + * this is often the Prime logo instead of the product image. 
+ *
+ * This plugin uses more specific selectors to target the actual product
+ * image:
+ * - #landingImage: The main product image ID
+ * - #imgTagWrapperId img: Fallback container for product images
+ * - #imageBlock img: Additional fallback for newer Amazon layouts
+ *
+ * By placing this plugin BEFORE metascraperAmazon() in the plugin chain,
+ * we ensure the correct image is extracted while keeping all other Amazon
+ * metadata (title, brand, description) from the original plugin.
+ */
+
+/**
+ * Hostnames this plugin applies to: Amazon storefronts (amazon.com,
+ * amazon.co.uk, amazon.com.au, ...), amzn.* short links and a.co short
+ * links, including any subdomain of those.
+ *
+ * The pattern is matched against the parsed hostname only and is anchored
+ * on a label boundary, so unrelated hosts that merely end in the same
+ * letters (e.g. "media.co") or URLs that mention "amazon." in their path or
+ * query string are not treated as Amazon pages.
+ */
+const REGEX_AMAZON_HOSTNAME =
+  /(?:^|\.)(?:amazon\.[a-z]{2,3}(?:\.[a-z]{2})?|amzn\.[a-z]{2,}|a\.co)$/i;
+
+/** Selectors for the product image, in priority order (most reliable first). */
+const PRODUCT_IMAGE_SELECTORS = [
+  "#landingImage",
+  "#imgTagWrapperId img",
+  "#imageBlock img",
+] as const;
+
+/**
+ * Attributes read from a matched element, in priority order.
+ * `data-old-hires` is preferred because it carries the high-resolution
+ * image; `src` is the fallback.
+ */
+const PRODUCT_IMAGE_ATTRIBUTES = ["data-old-hires", "src"] as const;
+
+/**
+ * Decides whether this plugin's rules apply to the page being scraped.
+ *
+ * @param url - URL of the page being scraped.
+ * @returns `true` for http(s) URLs served from an Amazon hostname, `false`
+ * otherwise, including when `url` cannot be parsed.
+ */
+const test = ({ url }: { url: string }): boolean => {
+  let parsed: URL;
+  try {
+    parsed = new URL(url);
+  } catch {
+    // Not an absolute URL: there is no hostname to check.
+    return false;
+  }
+
+  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
+    return false;
+  }
+  return REGEX_AMAZON_HOSTNAME.test(parsed.hostname);
+};
+
+/**
+ * Creates the metascraper rule set that extracts the product image from
+ * Amazon pages.
+ *
+ * @returns Rules containing only an `image` rule, gated by {@link test}.
+ */
+const metascraperAmazonImproved = () => {
+  const rules: Rules = {
+    pkgName: "metascraper-amazon-improved",
+    test,
+    image: ({ htmlDom }) => {
+      for (const selector of PRODUCT_IMAGE_SELECTORS) {
+        // Only the first match of each selector is considered.
+        const element = htmlDom(selector).first();
+        for (const attribute of PRODUCT_IMAGE_ATTRIBUTES) {
+          const value = element.attr(attribute);
+          // Skips both a missing attribute and an empty one.
+          if (value) {
+            return value;
+          }
+        }
+      }
+
+      // Return undefined to allow next plugin to try
+      return undefined;
+    },
+  };
+
+  return rules;
+};
+
+export default metascraperAmazonImproved;
diff --git 
a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index 2c96338a..aedf4aa0 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -82,6 +82,7 @@ import { getRateLimitClient } from "@karakeep/shared/ratelimiting"; import { tryCatch } from "@karakeep/shared/tryCatch"; import { BookmarkTypes } from "@karakeep/shared/types/bookmarks"; +import metascraperAmazonImproved from "../metascraper-plugins/metascraper-amazon-improved"; import metascraperReddit from "../metascraper-plugins/metascraper-reddit"; function abortPromise(signal: AbortSignal): Promise { @@ -125,6 +126,7 @@ const metascraperParser = metascraper([ dateModified: true, datePublished: true, }), + metascraperAmazonImproved(), // Fix image extraction bug - must come before metascraperAmazon() metascraperAmazon(), metascraperYoutube({ gotOpts: { -- cgit v1.2.3-70-g09d2