aboutsummaryrefslogtreecommitdiffstats
path: root/apps
diff options
context:
space:
mode:
authorRandall Hand <Yeraze@users.noreply.github.com>2025-12-14 06:15:53 -0500
committerGitHub <noreply@github.com>2025-12-14 11:15:53 +0000
commitb3196354d1272e6440119ba170c9d8369149baa4 (patch)
treea738bd10d164da69aeb471b7e0c551e86a94cea9 /apps
parentf5c32d940ea6c2a6da6c225c2de1eba13b49f9e0 (diff)
downloadkarakeep-b3196354d1272e6440119ba170c9d8369149baa4.tar.zst
fix: Fix Amazon product image extraction on amazon.com URLs (#2108)
The metascraper-amazon package extracts the first .a-dynamic-image element, which on amazon.com is often the Prime logo instead of the product image. That behavior happens to work on amazon.co.uk, where the product image appears first in the DOM. This change adds a custom metascraper plugin that uses more specific selectors (#landingImage, #imgTagWrapperId, #imageBlock) to target the actual product image. Because the plugin is placed before metascraperAmazon() in the chain, it fixes image extraction while preserving all other Amazon metadata (title, brand, description) from the original package. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude <noreply@anthropic.com>
Diffstat (limited to 'apps')
-rw-r--r--apps/workers/metascraper-plugins/metascraper-amazon-improved.ts77
-rw-r--r--apps/workers/workers/crawlerWorker.ts2
2 files changed, 79 insertions, 0 deletions
diff --git a/apps/workers/metascraper-plugins/metascraper-amazon-improved.ts b/apps/workers/metascraper-plugins/metascraper-amazon-improved.ts
new file mode 100644
index 00000000..ea9bf2e9
--- /dev/null
+++ b/apps/workers/metascraper-plugins/metascraper-amazon-improved.ts
@@ -0,0 +1,77 @@
+import type { Rules } from "metascraper";
+
+/**
+ * Improved Amazon metascraper plugin that fixes image extraction.
+ *
+ * The default metascraper-amazon package uses `.a-dynamic-image` selector
+ * which matches the FIRST element with that class. On amazon.com pages,
+ * this is often the Prime logo instead of the product image.
+ *
+ * This plugin uses more specific selectors to target the actual product
+ * image:
+ * - #landingImage: The main product image ID
+ * - #imgTagWrapperId img: Fallback container for product images
+ * - #imageBlock img: Additional fallback for newer Amazon layouts
+ *
+ * By placing this plugin BEFORE metascraperAmazon() in the plugin chain,
+ * we ensure the correct image is extracted while keeping all other Amazon
+ * metadata (title, brand, description) from the original plugin.
+ */
+
+/**
+ * Matches URLs whose host is an Amazon storefront (`amazon.<tld>`, including
+ * two-part suffixes such as `co.uk` / `com.au`, with or without a subdomain)
+ * or one of the short-link domains `amzn.<tld>` and `a.co`.
+ *
+ * The pattern is anchored to the scheme and host so that unrelated URLs that
+ * merely contain these fragments (e.g. `https://media.co/...`,
+ * `https://notamazon.com/...`, or `?ref=amazon.com/` in a query string) do
+ * not activate the Amazon-specific image rule. It is a shape check on the
+ * host, not a full public-suffix validation.
+ */
+const REGEX_AMAZON_URL =
+  /^https?:\/\/(?:[^\/?#@:]+\.)?(?:amazon\.[a-z]{2,3}(?:\.[a-z]{2})?|amzn\.[a-z]{2,}|a\.co)(?::\d+)?(?:[\/?#]|$)/i;
+
+/**
+ * Predicate used as this plugin's `test` entry.
+ *
+ * @param url - The URL of the page being scraped.
+ * @returns `true` when `url` matches {@link REGEX_AMAZON_URL}.
+ */
+const test = ({ url }: { url: string }): boolean => REGEX_AMAZON_URL.test(url);
+
+/**
+ * Builds the metascraper rule set that supplies only the `image` field for
+ * Amazon URLs.
+ *
+ * Lookup order: `#landingImage`, then `#imgTagWrapperId img`, then the first
+ * `#imageBlock img`. For each of them the `data-old-hires` attribute is
+ * preferred over `src`.
+ *
+ * @returns Rules containing `pkgName`, the URL `test` predicate and an
+ *   `image` rule that yields the first non-empty attribute value, or
+ *   `undefined` when none of the selectors produce one.
+ */
+const metascraperAmazonImproved = (): Rules => ({
+  pkgName: "metascraper-amazon-improved",
+  test,
+  image: ({ htmlDom }) => {
+    // Most specific selector first; each entry re-queries the DOM on demand.
+    const locators = [
+      () => htmlDom("#landingImage"),
+      () => htmlDom("#imgTagWrapperId img"),
+      () => htmlDom("#imageBlock img").first(),
+    ];
+
+    for (const locate of locators) {
+      // The high-resolution attribute wins over the regular source.
+      for (const attribute of ["data-old-hires", "src"]) {
+        const candidate = locate().attr(attribute);
+        if (candidate) {
+          return candidate;
+        }
+      }
+    }
+
+    // Nothing matched: return undefined so the next plugin can try.
+    return undefined;
+  },
+});
+
+export default metascraperAmazonImproved;
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 2c96338a..aedf4aa0 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -82,6 +82,7 @@ import { getRateLimitClient } from "@karakeep/shared/ratelimiting";
import { tryCatch } from "@karakeep/shared/tryCatch";
import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
+import metascraperAmazonImproved from "../metascraper-plugins/metascraper-amazon-improved";
import metascraperReddit from "../metascraper-plugins/metascraper-reddit";
function abortPromise(signal: AbortSignal): Promise<never> {
@@ -125,6 +126,7 @@ const metascraperParser = metascraper([
dateModified: true,
datePublished: true,
}),
+ metascraperAmazonImproved(), // Fix image extraction bug - must come before metascraperAmazon()
metascraperAmazon(),
metascraperYoutube({
gotOpts: {