aboutsummaryrefslogtreecommitdiffstats
path: root/workers/crawler.ts
blob: 817bba4520ce41b7c39455867696c39ac300a82e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import logger from "@remember/shared/logger";
import {
  OpenAIQueue,
  ZCrawlLinkRequest,
  zCrawlLinkRequestSchema,
} from "@remember/shared/queues";
import { Job } from "bullmq";

import prisma from "@remember/db";

import metascraper from "metascraper";

// Rule bundles handed to metascraper; each one extracts a single metadata
// field (description, image, favicon logo, title, canonical URL).
const metascraperRules = [
  require("metascraper-description")(),
  require("metascraper-image")(),
  require("metascraper-logo-favicon")(),
  require("metascraper-title")(),
  require("metascraper-url")(),
];

// Parser instance created once at module load and shared by every crawl job.
const metascraperParser = metascraper(metascraperRules);

/**
 * Queue processor for a single "crawl link" job.
 *
 * Validates the job payload, downloads the page at the requested URL,
 * extracts title/description/image/logo with metascraper, upserts those
 * values into the `details` relation of the bookmarked link, and finally
 * enqueues a follow-up job on the OpenAI queue for the same link.
 *
 * A payload that fails schema validation is logged and dropped: the function
 * returns without throwing. Errors from the fetch, the scraper, the database
 * update, or the enqueue are not caught here and reject the returned promise.
 *
 * @param job - Queue job whose `data` should match `zCrawlLinkRequestSchema`.
 * @returns A promise that resolves once the link details are stored and the
 *   follow-up job has been enqueued.
 */
export default async function runCrawler(
  job: Job<ZCrawlLinkRequest, void>,
): Promise<void> {
  const jobId = job.id || "unknown";

  const request = zCrawlLinkRequestSchema.safeParse(job.data);
  if (!request.success) {
    logger.error(
      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
    );
    return;
  }

  const { url, linkId } = request.data;

  logger.info(
    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
  );
  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)

  const resp = await fetch(url);
  const respBody = await resp.text();

  const meta = await metascraperParser({
    url,
    html: respBody,
  });

  // The same field values are written whether the details row is being
  // created or updated, so build the payload once.
  const details = {
    title: meta.title,
    description: meta.description,
    imageUrl: meta.image,
    favicon: meta.logo,
  };

  await prisma.bookmarkedLink.update({
    where: {
      id: linkId,
    },
    data: {
      details: {
        upsert: {
          create: details,
          update: details,
        },
      },
    },
    include: {
      details: true,
    },
  });

  // Enqueue openai job. Awaited so that an enqueue failure rejects this job
  // instead of surfacing as an unhandled promise rejection after it returns.
  await OpenAIQueue.add("openai", {
    linkId,
  });
}