diff options
| author | erik-nilcoast <138068205+erik-nilcoast@users.noreply.github.com> | 2025-03-22 17:38:50 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-03-22 22:38:50 +0000 |
| commit | b3417d87a0565b3536029e3b1e347611b5c6679b (patch) | |
| tree | f68d2faba080c41bb6c14d9e44ef68f6735f832e /apps/workers/crawlerWorker.ts | |
| parent | 13ba417f55ffd77bb5f4d4f4b53f349bd6d2dc4d (diff) | |
| download | karakeep-b3417d87a0565b3536029e3b1e347611b5c6679b.tar.zst | |
feat(workers): Adds publisher and author og:meta tags to Bookmark (#1141)
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 24 |
1 file changed, 24 insertions, 0 deletions
```diff
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 17dba443..5798b98c 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -14,9 +14,12 @@ import { JSDOM } from "jsdom";
 import { DequeuedJob, Runner } from "liteque";
 import metascraper from "metascraper";
 import metascraperAmazon from "metascraper-amazon";
+import metascraperAuthor from "metascraper-author";
+import metascraperDate from "metascraper-date";
 import metascraperDescription from "metascraper-description";
 import metascraperImage from "metascraper-image";
 import metascraperLogo from "metascraper-logo-favicon";
+import metascraperPublisher from "metascraper-publisher";
 import metascraperReadability from "metascraper-readability";
 import metascraperTitle from "metascraper-title";
 import metascraperTwitter from "metascraper-twitter";
@@ -61,8 +64,14 @@ import {
 import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
 
 const metascraperParser = metascraper([
+  metascraperDate({
+    dateModified: true,
+    datePublished: true,
+  }),
   metascraperAmazon(),
   metascraperReadability(),
+  metascraperAuthor(),
+  metascraperPublisher(),
   metascraperTitle(),
   metascraperDescription(),
   metascraperTwitter(),
@@ -663,6 +672,17 @@ async function crawlAndParseUrl(
   }
   abortSignal.throwIfAborted();
 
+  const parseDate = (date: string | undefined) => {
+    if (!date) {
+      return null;
+    }
+    try {
+      return new Date(date);
+    } catch (_e) {
+      return null;
+    }
+  };
+
   // TODO(important): Restrict the size of content to store
   await db.transaction(async (txn) => {
     await txn
@@ -677,6 +697,10 @@ async function crawlAndParseUrl(
         htmlContent: readableContent?.content,
         crawledAt: new Date(),
         crawlStatusCode: statusCode,
+        author: meta.author,
+        publisher: meta.publisher,
+        datePublished: parseDate(meta.datePublished),
+        dateModified: parseDate(meta.dateModified),
       })
       .where(eq(bookmarkLinks.id, bookmarkId));
 
```
