diff options
| author | erik-nilcoast <138068205+erik-nilcoast@users.noreply.github.com> | 2025-03-22 17:38:50 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-03-22 22:38:50 +0000 |
| commit | b3417d87a0565b3536029e3b1e347611b5c6679b (patch) | |
| tree | f68d2faba080c41bb6c14d9e44ef68f6735f832e /apps | |
| parent | 13ba417f55ffd77bb5f4d4f4b53f349bd6d2dc4d (diff) | |
| download | karakeep-b3417d87a0565b3536029e3b1e347611b5c6679b.tar.zst | |
feat(workers): Adds publisher and author og:meta tags to Bookmark (#1141)
Diffstat (limited to 'apps')
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 24 | ||||
| -rw-r--r-- | apps/workers/package.json | 5 | ||||
| -rw-r--r-- | apps/workers/searchWorker.ts | 4 |
3 files changed, 32 insertions, 1 deletion
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts index 17dba443..5798b98c 100644 --- a/apps/workers/crawlerWorker.ts +++ b/apps/workers/crawlerWorker.ts @@ -14,9 +14,12 @@ import { JSDOM } from "jsdom"; import { DequeuedJob, Runner } from "liteque"; import metascraper from "metascraper"; import metascraperAmazon from "metascraper-amazon"; +import metascraperAuthor from "metascraper-author"; +import metascraperDate from "metascraper-date"; import metascraperDescription from "metascraper-description"; import metascraperImage from "metascraper-image"; import metascraperLogo from "metascraper-logo-favicon"; +import metascraperPublisher from "metascraper-publisher"; import metascraperReadability from "metascraper-readability"; import metascraperTitle from "metascraper-title"; import metascraperTwitter from "metascraper-twitter"; @@ -61,8 +64,14 @@ import { import { BookmarkTypes } from "@hoarder/shared/types/bookmarks"; const metascraperParser = metascraper([ + metascraperDate({ + dateModified: true, + datePublished: true, + }), metascraperAmazon(), metascraperReadability(), + metascraperAuthor(), + metascraperPublisher(), metascraperTitle(), metascraperDescription(), metascraperTwitter(), @@ -663,6 +672,17 @@ async function crawlAndParseUrl( } abortSignal.throwIfAborted(); + const parseDate = (date: string | undefined) => { + if (!date) { + return null; + } + try { + return new Date(date); + } catch (_e) { + return null; + } + }; + // TODO(important): Restrict the size of content to store await db.transaction(async (txn) => { await txn @@ -677,6 +697,10 @@ async function crawlAndParseUrl( htmlContent: readableContent?.content, crawledAt: new Date(), crawlStatusCode: statusCode, + author: meta.author, + publisher: meta.publisher, + datePublished: parseDate(meta.datePublished), + dateModified: parseDate(meta.dateModified), }) .where(eq(bookmarkLinks.id, bookmarkId)); diff --git a/apps/workers/package.json b/apps/workers/package.json index 
be700b36..86d781aa 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -18,12 +18,15 @@ "execa": "9.3.1", "jsdom": "^24.0.0", "liteque": "^0.3.2", - "metascraper": "^5.45.24", + "metascraper": "^5.46.5", "metascraper-amazon": "^5.45.22", + "metascraper-author": "5.46.5", + "metascraper-date": "^5.46.5", "metascraper-description": "^5.45.22", "metascraper-image": "^5.45.22", "metascraper-logo": "^5.45.22", "metascraper-logo-favicon": "^5.45.22", + "metascraper-publisher": "^5.46.5", "metascraper-readability": "^5.45.22", "metascraper-title": "^5.45.22", "metascraper-twitter": "^5.45.6", diff --git a/apps/workers/searchWorker.ts b/apps/workers/searchWorker.ts index 1fbdbe73..7ad8b430 100644 --- a/apps/workers/searchWorker.ts +++ b/apps/workers/searchWorker.ts @@ -85,6 +85,10 @@ async function runIndex( linkTitle: bookmark.link.title, description: bookmark.link.description, content: bookmark.link.content, + publisher: bookmark.link.publisher, + author: bookmark.link.author, + datePublished: bookmark.link.datePublished, + dateModified: bookmark.link.dateModified, } : undefined), ...(bookmark.asset |
