Diffstat (limited to 'apps/workers/crawlerWorker.ts')
-rw-r--r--  apps/workers/crawlerWorker.ts | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 3952a287..208de44b 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -241,14 +241,12 @@ async function browserlessCrawlPage(jobId: string, url: string) {
   const response = await fetch(url, {
     signal: AbortSignal.timeout(5000),
   });
-  if (!response.ok) {
-    throw new Error(`Failed to crawl page: ${response.status}`);
-  }
   logger.info(
     `[Crawler][${jobId}] Successfully fetched the content of "${url}". Status: ${response.status}, Size: ${response.size}`,
   );
   return {
     htmlContent: await response.text(),
+    statusCode: response.status,
     screenshot: undefined,
     url: response.url,
   };
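With the "if (!response.ok)" guard removed, a non-2xx response no longer
aborts browserlessCrawlPage; the status code travels with the result and is
persisted further down. A minimal standalone sketch of that contract,
assuming Node 18+ global fetch (fetchPage is an illustrative name, not a
function from this repo):

// Sketch only: non-2xx statuses (404, 500, ...) are returned to the
// caller instead of being thrown as crawl failures.
async function fetchPage(url: string) {
  const response = await fetch(url, {
    signal: AbortSignal.timeout(5000), // abort if no response within 5s
  });
  return {
    htmlContent: await response.text(),
    statusCode: response.status, // recorded even when response.ok is false
    screenshot: undefined,
    url: response.url, // final URL after any redirects
  };
}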
@@ -260,6 +258,7 @@ async function crawlPage(
 ): Promise<{
   htmlContent: string;
   screenshot: Buffer | undefined;
+  statusCode: number;
   url: string;
 }> {
   let browser: Browser | undefined;
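Read as a named alias, the widened crawlPage contract is (CrawlResult is a
hypothetical name, not one the repo defines):

// Hypothetical alias mirroring the updated Promise payload above.
type CrawlResult = {
  htmlContent: string;
  screenshot: Buffer | undefined;
  statusCode: number; // new field; 0 when no navigation response exists
  url: string;
};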
@@ -282,7 +281,7 @@ async function crawlPage(
   await page.setUserAgent(
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
   );
-  await page.goto(url, {
+  const response = await page.goto(url, {
     timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
   });
   logger.info(
@@ -328,6 +327,7 @@ async function crawlPage(
   return {
     htmlContent,
+    statusCode: response?.status() ?? 0,
     screenshot,
     url: page.url(),
   };
@@ -583,6 +583,7 @@ async function crawlAndParseUrl(
   const {
     htmlContent,
     screenshot,
+    statusCode,
     url: browserUrl,
   } = await crawlPage(jobId, url);
@@ -618,6 +619,7 @@ async function crawlAndParseUrl(
       content: readableContent?.textContent,
       htmlContent: readableContent?.content,
       crawledAt: new Date(),
+      crawlStatusCode: statusCode,
     })
     .where(eq(bookmarkLinks.id, bookmarkId));
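For the .set() above to compile, bookmarkLinks needs a matching
crawlStatusCode column. A hedged sketch of what that schema entry could
look like with drizzle-orm's sqlite-core (the column types and table shape
here are assumptions; the real definition lives in the repo's schema
package):

import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core";

// Assumed shape -- only columns touched by this diff are shown.
export const bookmarkLinks = sqliteTable("bookmarkLinks", {
  id: text("id").primaryKey(),
  crawledAt: integer("crawledAt", { mode: "timestamp" }),
  // New: HTTP status of the last crawl; 0 means no response was captured.
  crawlStatusCode: integer("crawlStatusCode"),
});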