aboutsummaryrefslogtreecommitdiffstats
path: root/crawler/crawler.ts
diff options
context:
space:
mode:
authorMohamedBassem <me@mbassem.com>2024-02-06 18:16:35 +0000
committerMohamedBassem <me@mbassem.com>2024-02-06 19:24:52 +0000
commitbaf48af5f0a4b88642edc18ae8b16e81260e1846 (patch)
tree1f9779ac76b21ba7504ec664f05064d1b4e9ff2a /crawler/crawler.ts
parente035c2fd1067a06d4774c64ae54548f664490f9d (diff)
downloadkarakeep-baf48af5f0a4b88642edc18ae8b16e81260e1846.tar.zst
Implement metadata fetching logic in the crawler
Diffstat (limited to 'crawler/crawler.ts')
-rw-r--r--crawler/crawler.ts70
1 files changed, 68 insertions, 2 deletions
diff --git a/crawler/crawler.ts b/crawler/crawler.ts
index 58127331..c0f433af 100644
--- a/crawler/crawler.ts
+++ b/crawler/crawler.ts
@@ -1,6 +1,72 @@
import logger from "@remember/shared/logger";
+import {
+ ZCrawlLinkRequest,
+ zCrawlLinkRequestSchema,
+} from "@remember/shared/queues";
import { Job } from "bullmq";
-export default async function runCrawler(job: Job) {
- logger.info(`[Crawler] Got a new job: ${job.name}`);
+import prisma from "@remember/db";
+
+import metascraper from "metascraper";
+
+const metascraperParser = metascraper([
+ require("metascraper-description")(),
+ require("metascraper-image")(),
+ require("metascraper-logo-favicon")(),
+ require("metascraper-title")(),
+ require("metascraper-url")(),
+]);
+
+export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
+ const jobId = job.id || "unknown";
+
+ const request = zCrawlLinkRequestSchema.safeParse(job.data);
+ if (!request.success) {
+ logger.error(
+ `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+ );
+ return;
+ }
+
+ const { url, linkId } = request.data;
+
+ logger.info(
+ `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
+ );
+ // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
+
+ const resp = await fetch(url);
+ const respBody = await resp.text();
+
+ const meta = await metascraperParser({
+ url,
+ html: respBody,
+ });
+
+ await prisma.bookmarkedLink.update({
+ where: {
+ id: linkId,
+ },
+ data: {
+ details: {
+ upsert: {
+ create: {
+ title: meta.title,
+ description: meta.description,
+ imageUrl: meta.image,
+ favicon: meta.logo,
+ },
+ update: {
+ title: meta.title,
+ description: meta.description,
+ imageUrl: meta.image,
+ favicon: meta.logo,
+ },
+ },
+ },
+ },
+ include: {
+ details: true,
+ },
+ });
}