| author | MohamedBassem <me@mbassem.com> | 2024-03-21 02:35:39 +0000 |
|---|---|---|
| committer | MohamedBassem <me@mbassem.com> | 2024-03-21 02:35:39 +0000 |
| commit | 71707500f6e59b61f0a97fac7b2caaa0af4e3168 (patch) | |
| tree | ed29b64620bdb1d5ba8194ff19903ae4236abf56 /apps/workers/crawlerWorker.ts | |
| parent | 7d7d3754d33b41478fea2d2d7ed902d665a9e03d (diff) | |
fix: Simple validations for crawled URLs
Diffstat
| -rw-r--r-- | apps/workers/crawlerWorker.ts | 18 |
1 file changed, 17 insertions(+), 1 deletion(-)
```diff
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 5db2da7b..ecd8d146 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -113,6 +113,22 @@ async function getBookmarkUrl(bookmarkId: string) {
   return bookmark.url;
 }
 
+/**
+ * This provides some "basic" protection from malicious URLs. However, all of those
+ * can be easily circumvented by pointing dns of origin to localhost, or with
+ * redirects.
+ */
+function validateUrl(url: string) {
+  const urlParsed = new URL(url);
+  if (urlParsed.protocol != "http:" && urlParsed.protocol != "https:") {
+    throw new Error(`Unsupported URL protocol: ${urlParsed.protocol}`);
+  }
+
+  if (["localhost", "127.0.0.1", "0.0.0.0"].includes(urlParsed.hostname)) {
+    throw new Error(`Link hostname rejected: ${urlParsed.hostname}`);
+  }
+}
+
 async function crawlPage(url: string) {
   assert(browser);
   const context = await browser.createBrowserContext();
@@ -158,7 +174,7 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
   logger.info(
     `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
   );
-  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)
+  validateUrl(url);
 
   const htmlContent = await crawlPage(url);
```
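The doc comment added in this commit already concedes that matching on the hostname string is easy to bypass: an attacker can register a domain whose DNS record points at 127.0.0.1, or bounce the crawler through a redirect. Below is a minimal sketch, not part of this commit, of a stricter variant that resolves the hostname before accepting the URL; it assumes a Node.js runtime, and the names `validateUrlStrict` and `isPrivateAddress` are hypothetical.

```typescript
// Hypothetical hardening sketch; NOT part of this commit.
import { lookup } from "node:dns/promises";

// Rough private/loopback range check. Covers the common IPv4 ranges plus a
// few IPv6 textual prefixes; a production version should use a vetted
// allowlist/blocklist library instead of string prefixes.
function isPrivateAddress(address: string, family: number): boolean {
  if (family === 6) {
    const a = address.toLowerCase();
    // ::1 loopback, fc00::/7 unique-local, and fe80: (the common textual
    // form of link-local addresses).
    return (
      a === "::1" ||
      a.startsWith("fc") ||
      a.startsWith("fd") ||
      a.startsWith("fe80")
    );
  }
  const [a, b] = address.split(".").map(Number);
  return (
    a === 0 || // 0.0.0.0/8, includes 0.0.0.0
    a === 127 || // loopback
    a === 10 || // RFC 1918
    (a === 172 && b >= 16 && b <= 31) || // RFC 1918
    (a === 192 && b === 168) || // RFC 1918
    (a === 169 && b === 254) // link-local / cloud metadata endpoints
  );
}

async function validateUrlStrict(url: string): Promise<void> {
  const parsed = new URL(url);
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    throw new Error(`Unsupported URL protocol: ${parsed.protocol}`);
  }
  // Resolve every A/AAAA record so a public-looking hostname that maps to a
  // private address is rejected too.
  const records = await lookup(parsed.hostname, { all: true });
  for (const { address, family } of records) {
    if (isPrivateAddress(address, family)) {
      throw new Error(
        `Link hostname rejected: ${parsed.hostname} -> ${address}`,
      );
    }
  }
}
```

Even this remains subject to a time-of-check/time-of-use race (DNS rebinding between validation and fetch), and redirects inside `crawlPage` would need the same check applied per hop, so the caveat in the committed comment still stands.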
