about summary refs log tree commit diff stats
path: root/apps
diff options
context:
space:
mode:
Diffstat (limited to 'apps')
-rw-r--r-- apps/workers/crawlerWorker.ts 24
-rw-r--r-- apps/workers/package.json 5
-rw-r--r-- apps/workers/searchWorker.ts 4
3 files changed, 32 insertions, 1 deletions
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 17dba443..5798b98c 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -14,9 +14,12 @@ import { JSDOM } from "jsdom";
import { DequeuedJob, Runner } from "liteque";
import metascraper from "metascraper";
import metascraperAmazon from "metascraper-amazon";
+import metascraperAuthor from "metascraper-author";
+import metascraperDate from "metascraper-date";
import metascraperDescription from "metascraper-description";
import metascraperImage from "metascraper-image";
import metascraperLogo from "metascraper-logo-favicon";
+import metascraperPublisher from "metascraper-publisher";
import metascraperReadability from "metascraper-readability";
import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
@@ -61,8 +64,14 @@ import {
import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
const metascraperParser = metascraper([
+ metascraperDate({
+ dateModified: true,
+ datePublished: true,
+ }),
metascraperAmazon(),
metascraperReadability(),
+ metascraperAuthor(),
+ metascraperPublisher(),
metascraperTitle(),
metascraperDescription(),
metascraperTwitter(),
@@ -663,6 +672,17 @@ async function crawlAndParseUrl(
}
abortSignal.throwIfAborted();
+ // Parses a scraped date string; returns null when it is missing or invalid.
+ // `new Date(str)` never throws on bad input (it yields an Invalid Date), so
+ // the result is validated through getTime() rather than with a try/catch.
+ const parseDate = (date: string | undefined) => {
+ if (!date) {
+ return null;
+ }
+ const parsed = new Date(date);
+ return Number.isNaN(parsed.getTime()) ? null : parsed;
+ };
+
// TODO(important): Restrict the size of content to store
await db.transaction(async (txn) => {
await txn
@@ -677,6 +697,10 @@ async function crawlAndParseUrl(
htmlContent: readableContent?.content,
crawledAt: new Date(),
crawlStatusCode: statusCode,
+ author: meta.author,
+ publisher: meta.publisher,
+ datePublished: parseDate(meta.datePublished),
+ dateModified: parseDate(meta.dateModified),
})
.where(eq(bookmarkLinks.id, bookmarkId));
diff --git a/apps/workers/package.json b/apps/workers/package.json
index be700b36..86d781aa 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -18,12 +18,15 @@
"execa": "9.3.1",
"jsdom": "^24.0.0",
"liteque": "^0.3.2",
- "metascraper": "^5.45.24",
+ "metascraper": "^5.46.5",
"metascraper-amazon": "^5.45.22",
+ "metascraper-author": "5.46.5",
+ "metascraper-date": "^5.46.5",
"metascraper-description": "^5.45.22",
"metascraper-image": "^5.45.22",
"metascraper-logo": "^5.45.22",
"metascraper-logo-favicon": "^5.45.22",
+ "metascraper-publisher": "^5.46.5",
"metascraper-readability": "^5.45.22",
"metascraper-title": "^5.45.22",
"metascraper-twitter": "^5.45.6",
diff --git a/apps/workers/searchWorker.ts b/apps/workers/searchWorker.ts
index 1fbdbe73..7ad8b430 100644
--- a/apps/workers/searchWorker.ts
+++ b/apps/workers/searchWorker.ts
@@ -85,6 +85,10 @@ async function runIndex(
linkTitle: bookmark.link.title,
description: bookmark.link.description,
content: bookmark.link.content,
+ publisher: bookmark.link.publisher,
+ author: bookmark.link.author,
+ datePublished: bookmark.link.datePublished,
+ dateModified: bookmark.link.dateModified,
}
: undefined),
...(bookmark.asset