author     MohamedBassem <me@mbassem.com>    2024-03-21 02:35:39 +0000
committer  MohamedBassem <me@mbassem.com>    2024-03-21 02:35:39 +0000
commit     71707500f6e59b61f0a97fac7b2caaa0af4e3168 (patch)
tree       ed29b64620bdb1d5ba8194ff19903ae4236abf56 /apps/workers/crawlerWorker.ts
parent     7d7d3754d33b41478fea2d2d7ed902d665a9e03d (diff)
download   karakeep-71707500f6e59b61f0a97fac7b2caaa0af4e3168.tar.zst
fix: Simple validations for crawled URLs
Diffstat (limited to 'apps/workers/crawlerWorker.ts')
-rw-r--r--   apps/workers/crawlerWorker.ts   18
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 5db2da7b..ecd8d146 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -113,6 +113,22 @@ async function getBookmarkUrl(bookmarkId: string) {
return bookmark.url;
}
+/**
+ * This provides some "basic" protection from malicious URLs. However, all of those
+ * can be easily circumvented by pointing dns of origin to localhost, or with
+ * redirects.
+ */
+function validateUrl(url: string) {
+ const urlParsed = new URL(url);
+ if (urlParsed.protocol != "http:" && urlParsed.protocol != "https:") {
+ throw new Error(`Unsupported URL protocol: ${urlParsed.protocol}`);
+ }
+
+ if (["localhost", "127.0.0.1", "0.0.0.0"].includes(urlParsed.hostname)) {
+ throw new Error(`Link hostname rejected: ${urlParsed.hostname}`);
+ }
+}
+
async function crawlPage(url: string) {
assert(browser);
const context = await browser.createBrowserContext();
@@ -158,7 +174,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
logger.info(
`[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
);
- // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
+ validateUrl(url);
const htmlContent = await crawlPage(url);
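The new doc comment acknowledges that a plain hostname check can be bypassed by pointing a domain's DNS at localhost, or via redirects. A minimal sketch of one way to tighten this, assuming the worker runs on Node and can resolve the hostname with dns.promises.lookup before crawling; validateUrlStrict and isPrivateIpv4 are illustrative names, not part of this commit:

// Hypothetical hardening sketch, not part of this commit: resolve the hostname and
// reject loopback/private ranges so a DNS entry pointing at localhost is also caught.
import { promises as dns } from "node:dns";

// Assumed helper: flags loopback, RFC 1918, link-local and 0.0.0.0/8 IPv4 ranges.
function isPrivateIpv4(address: string): boolean {
  const [a, b] = address.split(".").map(Number);
  return (
    a === 127 || // loopback
    a === 10 || // 10.0.0.0/8
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12
    (a === 192 && b === 168) || // 192.168.0.0/16
    (a === 169 && b === 254) || // link-local
    a === 0 // "this network", includes 0.0.0.0
  );
}

async function validateUrlStrict(url: string): Promise<void> {
  const urlParsed = new URL(url);
  if (urlParsed.protocol !== "http:" && urlParsed.protocol !== "https:") {
    throw new Error(`Unsupported URL protocol: ${urlParsed.protocol}`);
  }
  // Resolve the hostname so the check applies to the address that would actually be crawled.
  const { address, family } = await dns.lookup(urlParsed.hostname);
  const privateV6 =
    address === "::1" ||
    address.startsWith("fc") ||
    address.startsWith("fd") ||
    address.startsWith("fe80");
  if ((family === 4 && isPrivateIpv4(address)) || (family === 6 && privateV6)) {
    throw new Error(`Link hostname resolves to a private address: ${address}`);
  }
}

Even with resolution, the redirect caveat from the commit's comment still applies: the crawler would need to re-validate each redirect hop (or disable redirects) for the check to hold end to end.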