diff options
| author | Mohamed Bassem <me@mbassem.com> | 2025-09-07 10:00:53 +0000 |
|---|---|---|
| committer | Mohamed Bassem <me@mbassem.com> | 2025-09-07 10:00:53 +0000 |
| commit | 517e0c105b159266e5ac7503d1124e44363b8828 (patch) | |
| tree | eeeaf10d3160ce9b917c47fb7dc20821c55c80f9 | |
| parent | 03f10c751a4ba577b6a4b0b1bf03b86a7bff1d5b (diff) | |
| download | karakeep-517e0c105b159266e5ac7503d1124e44363b8828.tar.zst | |
fix: fix pdf detection when the header contains charset. fix: #1677
| -rw-r--r-- | apps/workers/workers/crawlerWorker.ts | 18 |
1 files changed, 16 insertions, 2 deletions
```diff
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 625c92d9..e011b826 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -67,6 +67,17 @@ import { BookmarkTypes } from "@karakeep/shared/types/bookmarks";
 
 import metascraperReddit from "../metascraper-plugins/metascraper-reddit";
 
+/**
+ * Normalize a Content-Type header by stripping parameters (e.g., charset)
+ * and lowercasing the media type, so comparisons against supported types work.
+ */
+function normalizeContentType(header: string | null): string | null {
+  if (!header) {
+    return null;
+  }
+  return header.split(";", 1)[0]!.trim().toLowerCase();
+}
+
 const metascraperParser = metascraper([
   metascraperDate({
     dateModified: true,
@@ -531,7 +542,9 @@ async function downloadAndStoreFile(
     const buffer = await response.arrayBuffer();
     const assetId = newAssetId();
 
-    const contentType = response.headers.get("content-type");
+    const contentType = normalizeContentType(
+      response.headers.get("content-type"),
+    );
     if (!contentType) {
       throw new Error("No content type in the response");
     }
@@ -662,7 +675,8 @@ async function getContentType(
       method: "HEAD",
       signal: AbortSignal.any([AbortSignal.timeout(5000), abortSignal]),
     });
-    const contentType = response.headers.get("content-type");
+    const rawContentType = response.headers.get("content-type");
+    const contentType = normalizeContentType(rawContentType);
     logger.info(
       `[Crawler][${jobId}] Content-type for the url ${url} is "${contentType}"`,
     );
```
