diff options
| author | MohamedBassem <me@mbassem.com> | 2024-02-06 18:16:35 +0000 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-02-06 19:24:52 +0000 |
| commit | baf48af5f0a4b88642edc18ae8b16e81260e1846 (patch) | |
| tree | 1f9779ac76b21ba7504ec664f05064d1b4e9ff2a /crawler/crawler.ts | |
| parent | e035c2fd1067a06d4774c64ae54548f664490f9d (diff) | |
| download | karakeep-baf48af5f0a4b88642edc18ae8b16e81260e1846.tar.zst | |
Implement metadata fetching logic in the crawler
Diffstat (limited to 'crawler/crawler.ts')
| -rw-r--r-- | crawler/crawler.ts | 70 |
1 file changed, 68 insertions, 2 deletions
```diff
diff --git a/crawler/crawler.ts b/crawler/crawler.ts
index 58127331..c0f433af 100644
--- a/crawler/crawler.ts
+++ b/crawler/crawler.ts
@@ -1,6 +1,72 @@
 import logger from "@remember/shared/logger";
+import {
+  ZCrawlLinkRequest,
+  zCrawlLinkRequestSchema,
+} from "@remember/shared/queues";
 import { Job } from "bullmq";
 
-export default async function runCrawler(job: Job) {
-  logger.info(`[Crawler] Got a new job: ${job.name}`);
+import prisma from "@remember/db";
+
+import metascraper from "metascraper";
+
+const metascraperParser = metascraper([
+  require("metascraper-description")(),
+  require("metascraper-image")(),
+  require("metascraper-logo-favicon")(),
+  require("metascraper-title")(),
+  require("metascraper-url")(),
+]);
+
+export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
+  const jobId = job.id || "unknown";
+
+  const request = zCrawlLinkRequestSchema.safeParse(job.data);
+  if (!request.success) {
+    logger.error(
+      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
+    );
+    return;
+  }
+
+  const { url, linkId } = request.data;
+
+  logger.info(
+    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
+  );
+  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
+
+  const resp = await fetch(url);
+  const respBody = await resp.text();
+
+  const meta = await metascraperParser({
+    url,
+    html: respBody,
+  });
+
+  await prisma.bookmarkedLink.update({
+    where: {
+      id: linkId,
+    },
+    data: {
+      details: {
+        upsert: {
+          create: {
+            title: meta.title,
+            description: meta.description,
+            imageUrl: meta.image,
+            favicon: meta.logo,
+          },
+          update: {
+            title: meta.title,
+            description: meta.description,
+            imageUrl: meta.image,
+            favicon: meta.logo,
+          },
+        },
+      },
+    },
+    include: {
+      details: true,
+    },
+  });
 }
```
