aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMohamed Bassem <me@mbassem.com>2025-09-07 10:00:53 +0000
committerMohamed Bassem <me@mbassem.com>2025-09-07 10:00:53 +0000
commit517e0c105b159266e5ac7503d1124e44363b8828 (patch)
treeeeeaf10d3160ce9b917c47fb7dc20821c55c80f9
parent03f10c751a4ba577b6a4b0b1bf03b86a7bff1d5b (diff)
downloadkarakeep-517e0c105b159266e5ac7503d1124e44363b8828.tar.zst
fix: fix pdf detection when the header contains charset. fix: #1677
-rw-r--r--apps/workers/workers/crawlerWorker.ts18
1 files changed, 16 insertions, 2 deletions
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 625c92d9..e011b826 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -67,6 +67,17 @@ import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
import metascraperReddit from "../metascraper-plugins/metascraper-reddit";
/**
 * Normalize a Content-Type header value so it can be compared against a list
 * of supported media types.
 *
 * Everything from the first `;` onwards (parameters such as `charset=utf-8`)
 * is dropped, surrounding whitespace is trimmed, and the media type is
 * lowercased.
 *
 * @param header - Raw header value, or `null` when the header is absent.
 * @returns The bare, lowercased media type, or `null` when the header is
 * missing or carries no media type at all (empty, whitespace only, or
 * parameters only such as `; charset=utf-8`).
 */
function normalizeContentType(header: string | null): string | null {
  if (!header) {
    return null;
  }
  // `split` always yields at least one element for a non-empty string; the
  // destructuring default keeps this type-safe without a non-null assertion.
  const [mediaType = ""] = header.split(";", 1);
  const normalized = mediaType.trim().toLowerCase();
  // Report an empty media type as "no content type" so callers only have to
  // deal with a single missing-value representation.
  return normalized === "" ? null : normalized;
}
+
const metascraperParser = metascraper([
metascraperDate({
dateModified: true,
@@ -531,7 +542,9 @@ async function downloadAndStoreFile(
const buffer = await response.arrayBuffer();
const assetId = newAssetId();
- const contentType = response.headers.get("content-type");
+ const contentType = normalizeContentType(
+ response.headers.get("content-type"),
+ );
if (!contentType) {
throw new Error("No content type in the response");
}
@@ -662,7 +675,8 @@ async function getContentType(
method: "HEAD",
signal: AbortSignal.any([AbortSignal.timeout(5000), abortSignal]),
});
- const contentType = response.headers.get("content-type");
+ const rawContentType = response.headers.get("content-type");
+ const contentType = normalizeContentType(rawContentType);
logger.info(
`[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
);