aboutsummaryrefslogtreecommitdiffstats
path: root/crawler
diff options
context:
space:
mode:
author MohamedBassem <me@mbassem.com> 2024-02-06 18:16:35 +0000
committer MohamedBassem <me@mbassem.com> 2024-02-06 19:24:52 +0000
commit baf48af5f0a4b88642edc18ae8b16e81260e1846 (patch)
tree 1f9779ac76b21ba7504ec664f05064d1b4e9ff2a /crawler
parent e035c2fd1067a06d4774c64ae54548f664490f9d (diff)
download karakeep-baf48af5f0a4b88642edc18ae8b16e81260e1846.tar.zst
Implement metadata fetching logic in the crawler
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/crawler.ts 70
-rw-r--r-- crawler/index.ts 32
-rw-r--r-- crawler/main.ts 17
-rw-r--r-- crawler/package.json 13
4 files changed, 112 insertions, 20 deletions
diff --git a/crawler/crawler.ts b/crawler/crawler.ts
index 58127331..c0f433af 100644
--- a/crawler/crawler.ts
+++ b/crawler/crawler.ts
@@ -1,6 +1,72 @@
import logger from "@remember/shared/logger";
+import {
+ ZCrawlLinkRequest,
+ zCrawlLinkRequestSchema,
+} from "@remember/shared/queues";
import { Job } from "bullmq";
-export default async function runCrawler(job: Job) {
- logger.info(`[Crawler] Got a new job: ${job.name}`);
+import prisma from "@remember/db";
+
+import metascraper from "metascraper";
+
+// Metadata extractor, built once when this module is loaded. It is composed
+// from the metascraper description, image, logo-favicon, title and url
+// plugins; each plugin module exports a factory that is invoked here.
+const metascraperParser = metascraper([
+ require("metascraper-description")(),
+ require("metascraper-image")(),
+ require("metascraper-logo-favicon")(),
+ require("metascraper-title")(),
+ require("metascraper-url")(),
+]);
+
+/**
+ * Crawls the URL carried by a queue job, extracts its page metadata and
+ * stores it on the matching bookmarked link.
+ *
+ * A payload that fails schema validation is logged and dropped without
+ * throwing. Network failures, non-2xx responses and database errors reject
+ * the returned promise.
+ *
+ * @param job - Queue job whose data is validated against zCrawlLinkRequestSchema.
+ * @throws Error when the server answers with a non-2xx HTTP status.
+ */
+export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
+  const jobId = job.id || "unknown";
+
+  const request = zCrawlLinkRequestSchema.safeParse(job.data);
+  if (!request.success) {
+    logger.error(
+      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+    );
+    return;
+  }
+
+  const { url, linkId } = request.data;
+
+  logger.info(
+    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
+  );
+  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
+
+  const resp = await fetch(url);
+  // fetch() only rejects on network errors. Without this check the body of a
+  // 404/500 page would be scraped and saved as the link's metadata.
+  if (!resp.ok) {
+    throw new Error(`Failed to fetch "${url}": HTTP ${resp.status}`);
+  }
+  const respBody = await resp.text();
+
+  const meta = await metascraperParser({
+    url,
+    html: respBody,
+  });
+
+  // The same fields are written whether the details row is created or updated.
+  const details = {
+    title: meta.title,
+    description: meta.description,
+    imageUrl: meta.image,
+    favicon: meta.logo,
+  };
+
+  await prisma.bookmarkedLink.update({
+    where: {
+      id: linkId,
+    },
+    data: {
+      details: {
+        upsert: {
+          create: details,
+          update: details,
+        },
+      },
+    },
+    include: {
+      details: true,
+    },
+  });
+}
diff --git a/crawler/index.ts b/crawler/index.ts
new file mode 100644
index 00000000..76c6f03f
--- /dev/null
+++ b/crawler/index.ts
@@ -0,0 +1,32 @@
+import { Worker } from "bullmq";
+
+import {
+ LinkCrawlerQueue,
+ ZCrawlLinkRequest,
+ queueConnectionDetails,
+} from "@remember/shared/queues";
+import logger from "@remember/shared/logger";
+import runCrawler from "./crawler";
+
+// Prefix shared by every per-job log line; falls back to "unknown" when the
+// job or its id is missing.
+const jobTag = (job?: { id?: string }) =>
+  `[Crawler][${job?.id || "unknown"}]`;
+
+logger.info("Starting crawler worker ...");
+
+// autorun is disabled, so the worker only starts consuming jobs once run()
+// is called at the bottom of this script.
+const workerOptions = {
+  connection: queueConnectionDetails,
+  autorun: false,
+};
+
+const crawlerWorker = new Worker<ZCrawlLinkRequest, void>(
+  LinkCrawlerQueue.name,
+  runCrawler,
+  workerOptions,
+);
+
+// Report the outcome of each job.
+crawlerWorker.on("completed", (finishedJob) => {
+  logger.info(`${jobTag(finishedJob)} Completed successfully`);
+});
+
+crawlerWorker.on("failed", (failedJob, reason) => {
+  logger.error(`${jobTag(failedJob)} Crawling job failed: ${reason}`);
+});
+
+// Start the worker and wait on its run() promise.
+await Promise.all([crawlerWorker.run()]);
diff --git a/crawler/main.ts b/crawler/main.ts
deleted file mode 100644
index 7d1c0f11..00000000
--- a/crawler/main.ts
+++ /dev/null
@@ -1,17 +0,0 @@
-import { Worker } from "bullmq";
-
-import {
- LinkCrawlerQueue,
- queueConnectionDetails,
-} from "@remember/shared/queues";
-import logger from "@remember/shared/logger";
-import runCrawler from "./crawler";
-
-logger.info("Starting crawler worker ...");
-
-const crawlerWorker = new Worker(LinkCrawlerQueue.name, runCrawler, {
- connection: queueConnectionDetails,
- autorun: false,
-});
-
-await Promise.all([crawlerWorker]);
diff --git a/crawler/package.json b/crawler/package.json
index 67e38cff..9b590eb8 100644
--- a/crawler/package.json
+++ b/crawler/package.json
@@ -1,8 +1,19 @@
{
+ "$schema": "https://json.schemastore.org/package.json",
"name": "@remember/crawler",
"version": "0.1.0",
"private": true,
"dependencies": {
- "@remember/shared": "workspace:*"
+ "@remember/shared": "workspace:*",
+ "metascraper": "^5.43.4",
+ "metascraper-description": "^5.43.4",
+ "metascraper-image": "^5.43.4",
+ "metascraper-logo": "^5.43.4",
+ "metascraper-title": "^5.43.4",
+ "metascraper-url": "^5.43.4",
+ "metascraper-logo-favicon": "^5.43.4"
+ },
+ "devDependencies": {
+ "@types/metascraper": "^5.14.3"
}
}