From 3ec45e8bbb8285b17c703907d4c161b633663096 Mon Sep 17 00:00:00 2001
From: MohamedBassem
Date: Wed, 7 Feb 2024 18:29:52 +0000
Subject: [refactor] Rename the crawlers package to workers

---
 Makefile             |  4 +--
 crawler/crawler.ts   | 72 ----------------------------------------------------
 crawler/index.ts     | 32 -----------------------
 crawler/package.json | 19 --------------
 package.json         |  2 +-
 workers/crawler.ts   | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 workers/index.ts     | 32 +++++++++++++++++++++++
 workers/package.json | 19 ++++++++++++++
 8 files changed, 126 insertions(+), 126 deletions(-)
 delete mode 100644 crawler/crawler.ts
 delete mode 100644 crawler/index.ts
 delete mode 100644 crawler/package.json
 create mode 100644 workers/crawler.ts
 create mode 100644 workers/index.ts
 create mode 100644 workers/package.json

diff --git a/Makefile b/Makefile
index c37d7541..9764c2b4 100644
--- a/Makefile
+++ b/Makefile
@@ -8,8 +8,8 @@ prisma:
 	bunx prisma migrate dev; \
 	bunx prisma generate
 
-worker:
-	cd crawler; \
+workers:
+	cd workers; \
 	bun --watch index.ts
 web:
 	cd web; \
diff --git a/crawler/crawler.ts b/crawler/crawler.ts
deleted file mode 100644
index c0f433af..00000000
--- a/crawler/crawler.ts
+++ /dev/null
@@ -1,72 +0,0 @@
-import logger from "@remember/shared/logger";
-import {
-  ZCrawlLinkRequest,
-  zCrawlLinkRequestSchema,
-} from "@remember/shared/queues";
-import { Job } from "bullmq";
-
-import prisma from "@remember/db";
-
-import metascraper from "metascraper";
-
-const metascraperParser = metascraper([
-  require("metascraper-description")(),
-  require("metascraper-image")(),
-  require("metascraper-logo-favicon")(),
-  require("metascraper-title")(),
-  require("metascraper-url")(),
-]);
-
-export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
-  const jobId = job.id || "unknown";
-
-  const request = zCrawlLinkRequestSchema.safeParse(job.data);
-  if (!request.success) {
-    logger.error(
-      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
-    );
-    return;
-  }
-
-  const { url, linkId } = request.data;
-
-  logger.info(
-    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
-  );
-  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
-
-  const resp = await fetch(url);
-  const respBody = await resp.text();
-
-  const meta = await metascraperParser({
-    url,
-    html: respBody,
-  });
-
-  await prisma.bookmarkedLink.update({
-    where: {
-      id: linkId,
-    },
-    data: {
-      details: {
-        upsert: {
-          create: {
-            title: meta.title,
-            description: meta.description,
-            imageUrl: meta.image,
-            favicon: meta.logo,
-          },
-          update: {
-            title: meta.title,
-            description: meta.description,
-            imageUrl: meta.image,
-            favicon: meta.logo,
-          },
-        },
-      },
-    },
-    include: {
-      details: true,
-    },
-  });
-}
diff --git a/crawler/index.ts b/crawler/index.ts
deleted file mode 100644
index 76c6f03f..00000000
--- a/crawler/index.ts
+++ /dev/null
@@ -1,32 +0,0 @@
-import { Worker } from "bullmq";
-
-import {
-  LinkCrawlerQueue,
-  ZCrawlLinkRequest,
-  queueConnectionDetails,
-} from "@remember/shared/queues";
-import logger from "@remember/shared/logger";
-import runCrawler from "./crawler";
-
-logger.info("Starting crawler worker ...");
-
-const crawlerWorker = new Worker<ZCrawlLinkRequest, void>(
-  LinkCrawlerQueue.name,
-  runCrawler,
-  {
-    connection: queueConnectionDetails,
-    autorun: false,
-  },
-);
-
-crawlerWorker.on("completed", (job) => {
-  const jobId = job?.id || "unknown";
-  logger.info(`[Crawler][${jobId}] Completed successfully`);
-});
-
-crawlerWorker.on("failed", (job, error) => {
-  const jobId = job?.id || "unknown";
-  logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
-});
-
-await Promise.all([crawlerWorker.run()]);
diff --git a/crawler/package.json b/crawler/package.json
deleted file mode 100644
index 9b590eb8..00000000
--- a/crawler/package.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
-  "$schema": "https://json.schemastore.org/package.json",
-  "name": "@remember/crawler",
-  "version": "0.1.0",
-  "private": true,
-  "dependencies": {
-    "@remember/shared": "workspace:*",
-    "metascraper": "^5.43.4",
-    "metascraper-description": "^5.43.4",
-    "metascraper-image": "^5.43.4",
-    "metascraper-logo": "^5.43.4",
-    "metascraper-title": "^5.43.4",
-    "metascraper-url": "^5.43.4",
-    "metascraper-logo-favicon": "^5.43.4"
-  },
-  "devDependencies": {
-    "@types/metascraper": "^5.14.3"
-  }
-}
diff --git a/package.json b/package.json
index c57be20a..0462a4c7 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,7 @@
   "private": true,
   "workspaces": [
     "web",
-    "crawler",
+    "workers",
     "shared",
     "db"
   ],
diff --git a/workers/crawler.ts b/workers/crawler.ts
new file mode 100644
index 00000000..c0f433af
--- /dev/null
+++ b/workers/crawler.ts
@@ -0,0 +1,72 @@
+import logger from "@remember/shared/logger";
+import {
+  ZCrawlLinkRequest,
+  zCrawlLinkRequestSchema,
+} from "@remember/shared/queues";
+import { Job } from "bullmq";
+
+import prisma from "@remember/db";
+
+import metascraper from "metascraper";
+
+const metascraperParser = metascraper([
+  require("metascraper-description")(),
+  require("metascraper-image")(),
+  require("metascraper-logo-favicon")(),
+  require("metascraper-title")(),
+  require("metascraper-url")(),
+]);
+
+export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
+  const jobId = job.id || "unknown";
+
+  const request = zCrawlLinkRequestSchema.safeParse(job.data);
+  if (!request.success) {
+    logger.error(
+      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+    );
+    return;
+  }
+
+  const { url, linkId } = request.data;
+
+  logger.info(
+    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
+  );
+  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
+
+  const resp = await fetch(url);
+  const respBody = await resp.text();
+
+  const meta = await metascraperParser({
+    url,
+    html: respBody,
+  });
+
+  await prisma.bookmarkedLink.update({
+    where: {
+      id: linkId,
+    },
+    data: {
+      details: {
+        upsert: {
+          create: {
+            title: meta.title,
+            description: meta.description,
+            imageUrl: meta.image,
+            favicon: meta.logo,
+          },
+          update: {
+            title: meta.title,
+            description: meta.description,
+            imageUrl: meta.image,
+            favicon: meta.logo,
+          },
+        },
+      },
+    },
+    include: {
+      details: true,
+    },
+  });
+}
diff --git a/workers/index.ts b/workers/index.ts
new file mode 100644
index 00000000..76c6f03f
--- /dev/null
+++ b/workers/index.ts
@@ -0,0 +1,32 @@
+import { Worker } from "bullmq";
+
+import {
+  LinkCrawlerQueue,
+  ZCrawlLinkRequest,
+  queueConnectionDetails,
+} from "@remember/shared/queues";
+import logger from "@remember/shared/logger";
+import runCrawler from "./crawler";
+
+logger.info("Starting crawler worker ...");
+
+const crawlerWorker = new Worker<ZCrawlLinkRequest, void>(
+  LinkCrawlerQueue.name,
+  runCrawler,
+  {
+    connection: queueConnectionDetails,
+    autorun: false,
+  },
+);
+
+crawlerWorker.on("completed", (job) => {
+  const jobId = job?.id || "unknown";
+  logger.info(`[Crawler][${jobId}] Completed successfully`);
+});
+
+crawlerWorker.on("failed", (job, error) => {
+  const jobId = job?.id || "unknown";
+  logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
+});
+
+await Promise.all([crawlerWorker.run()]);
diff --git a/workers/package.json b/workers/package.json
new file mode 100644
index 00000000..950233ab
--- /dev/null
+++ b/workers/package.json
@@ -0,0 +1,19 @@
+{
+  "$schema": "https://json.schemastore.org/package.json",
+  "name": "@remember/workers",
+  "version": "0.1.0",
+  "private": true,
+  "dependencies": {
+    "@remember/shared": "workspace:*",
+    "metascraper": "^5.43.4",
+    "metascraper-description": "^5.43.4",
+    "metascraper-image": "^5.43.4",
+    "metascraper-logo": "^5.43.4",
+    "metascraper-title": "^5.43.4",
+    "metascraper-url": "^5.43.4",
+    "metascraper-logo-favicon": "^5.43.4"
+  },
+  "devDependencies": {
+    "@types/metascraper": "^5.14.3"
+  }
+}
-- 
cgit v1.2.3-70-g09d2
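
The TODO carried over into workers/crawler.ts notes that the input URL is never validated before fetching, so the worker will crawl whatever it is handed, including localhost and internal addresses. As a rough illustration of the missing guard, here is a minimal sketch in TypeScript, assuming Node/Bun's built-in node:net and node:dns/promises modules; the helper name assertCrawlableUrl and the exact deny-list are illustrative assumptions, not part of this patch:

import { lookup } from "node:dns/promises";
import { isIP } from "node:net";

// True for addresses a crawler should never fetch (loopback, RFC 1918,
// link-local). A sketch: it does not handle IPv4-mapped IPv6 and other
// corner cases a production deny-list would need.
function isPrivateAddress(addr: string): boolean {
  if (isIP(addr) === 6) {
    const a = addr.toLowerCase();
    // ::1 loopback, fe80::/10 link-local, fc00::/7 unique-local.
    return a === "::1" || a.startsWith("fe80:") || a.startsWith("fc") || a.startsWith("fd");
  }
  const [o1, o2] = addr.split(".").map(Number);
  return (
    o1 === 127 ||                           // loopback
    o1 === 10 ||                            // 10.0.0.0/8
    (o1 === 172 && o2 >= 16 && o2 <= 31) || // 172.16.0.0/12
    (o1 === 192 && o2 === 168) ||           // 192.168.0.0/16
    (o1 === 169 && o2 === 254) ||           // link-local / cloud metadata
    o1 === 0
  );
}

// Throws unless the URL is a public http(s) target; call before fetch(url).
export async function assertCrawlableUrl(rawUrl: string): Promise<void> {
  const url = new URL(rawUrl); // throws on malformed input
  if (url.protocol !== "http:" && url.protocol !== "https:") {
    throw new Error(`Unsupported protocol: ${url.protocol}`);
  }
  const { address } = await lookup(url.hostname); // IP literals pass through
  if (isPrivateAddress(address)) {
    throw new Error(`Refusing to crawl private address: ${address}`);
  }
}

Note that a resolve-then-validate-then-fetch sequence like this is still exposed to DNS rebinding; pinning the resolved address for the actual request would close that gap.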