/**
 * Converts a date string scraped from page metadata into a `Date`.
 *
 * `new Date(str)` never throws on unparseable input; it yields an
 * "Invalid Date" whose time value is `NaN`. Validity is therefore checked
 * explicitly instead of relying on a `try/catch`, so that garbage metadata
 * is stored as `null` rather than as an invalid date.
 *
 * @param date - Raw date string, or `undefined` when the page had none.
 * @returns The parsed `Date`, or `null` when the input is missing, empty,
 *   or cannot be parsed.
 */
const parseDate = (date: string | undefined): Date | null => {
  if (!date) {
    return null;
  }
  const parsed = new Date(date);
  // An unparseable string produces a Date whose getTime() is NaN.
  return Number.isNaN(parsed.getTime()) ? null : parsed;
};
+697,10 @@ async function crawlAndParseUrl( htmlContent: readableContent?.content, crawledAt: new Date(), crawlStatusCode: statusCode, + author: meta.author, + publisher: meta.publisher, + datePublished: parseDate(meta.datePublished), + dateModified: parseDate(meta.dateModified), }) .where(eq(bookmarkLinks.id, bookmarkId)); -- cgit v1.2.3-70-g09d2