diff options
| -rw-r--r-- | README.md | 1 | ||||
| -rw-r--r-- | apps/workers/openaiWorker.ts | 22 | ||||
| -rw-r--r-- | apps/workers/package.json | 1 | ||||
| -rw-r--r-- | apps/workers/utils.ts | 22 | ||||
| -rw-r--r-- | docs/docs/01-intro.md | 1 | ||||
| -rw-r--r-- | docs/docs/03-configuration.md | 12 | ||||
| -rw-r--r-- | packages/shared/config.ts | 11 | ||||
| -rw-r--r-- | pnpm-lock.yaml | 71 |
8 files changed, 139 insertions, 2 deletions
@@ -23,6 +23,7 @@ A self-hostable bookmark-everything app with a touch of AI for the data hoarders - 📋 Sort your bookmarks into lists. - 🔎 Full text search of all the content stored. - ✨ AI-based (aka chatgpt) automatic tagging. With supports for local models using ollama! +- 🎆 OCR for extracting text from images. - 🔖 [Chrome plugin](https://chromewebstore.google.com/detail/hoarder/kgcjekpmcjjogibpjebkhaanilehneje) and [Firefox addon](https://addons.mozilla.org/en-US/firefox/addon/hoarder/) for quick bookmarking. - 📱 An [iOS app](https://apps.apple.com/us/app/hoarder-app/id6479258022), and an [Android app](https://play.google.com/store/apps/details?id=app.hoarder.hoardermobile&pcampaignid=web_share). - 🗄️ Full page archival (using [monolith](https://github.com/Y2Z/monolith)) to protect against link rot. diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts index d51771b2..f436f71b 100644 --- a/apps/workers/openaiWorker.ts +++ b/apps/workers/openaiWorker.ts @@ -23,7 +23,7 @@ import { import type { InferenceClient } from "./inference"; import { InferenceClientFactory } from "./inference"; -import { readPDFText } from "./utils"; +import { readImageText, readPDFText } from "./utils"; const openAIResponseSchema = z.object({ tags: z.array(z.string()), @@ -152,6 +152,26 @@ async function inferTagsFromImage( `[inference][${jobId}] AssetId ${bookmark.asset.assetId} for bookmark ${bookmark.id} not found`, ); } + + let imageText = null; + try { + imageText = await readImageText(asset); + } catch (e) { + logger.error(`[inference][${jobId}] Failed to read image text: ${e}`); + } + + if (imageText) { + logger.info( + `[inference][${jobId}] Extracted ${imageText.length} characters from image.`, + ); + await db + .update(bookmarkAssets) + .set({ + content: imageText, + }) + .where(eq(bookmarkAssets.id, bookmark.id)); + } + const base64 = asset.toString("base64"); return inferenceClient.inferFromImage( buildImagePrompt( diff --git a/apps/workers/package.json 
b/apps/workers/package.json index 88e803fe..0ab7caa2 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -34,6 +34,7 @@ "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-adblocker": "^2.13.6", "puppeteer-extra-plugin-stealth": "^2.11.2", + "tesseract.js": "^5.1.1", "tsx": "^4.7.1", "typescript": "^5.3.3", "zod": "^3.22.4" diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts index 8d297e05..15634902 100644 --- a/apps/workers/utils.ts +++ b/apps/workers/utils.ts @@ -1,4 +1,8 @@ +import os from "os"; import PDFParser from "pdf2json"; +import { createWorker } from "tesseract.js"; + +import serverConfig from "@hoarder/shared/config"; export function withTimeout<T, Ret>( func: (param: T) => Promise<Ret>, @@ -17,6 +21,24 @@ export function withTimeout<T, Ret>( }; } +export async function readImageText(buffer: Buffer) { + if (serverConfig.ocr.langs.length == 1 && serverConfig.ocr.langs[0] == "") { + return null; + } + const worker = await createWorker(serverConfig.ocr.langs, undefined, { + cachePath: serverConfig.ocr.cacheDir ?? os.tmpdir(), + }); + try { + const ret = await worker.recognize(buffer); + if (ret.data.confidence <= serverConfig.ocr.confidenceThreshold) { + return null; + } + return ret.data.text; + } finally { + await worker.terminate(); + } +} + export async function readPDFText(buffer: Buffer): Promise<{ text: string; metadata: Record<string, string>; diff --git a/docs/docs/01-intro.md b/docs/docs/01-intro.md index 1e9dd263..d35585f8 100644 --- a/docs/docs/01-intro.md +++ b/docs/docs/01-intro.md @@ -16,6 +16,7 @@ Hoarder is an open source "Bookmark Everything" app that uses AI for automatical - 📋 Sort your bookmarks into lists. - 🔎 Full text search of all the content stored. - ✨ AI-based (aka chatgpt) automatic tagging. With supports for local models using ollama! +- 🎆 OCR for extracting text from images. 
- 🔖 [Chrome plugin](https://chromewebstore.google.com/detail/hoarder/kgcjekpmcjjogibpjebkhaanilehneje) and [Firefox addon](https://addons.mozilla.org/en-US/firefox/addon/hoarder/) for quick bookmarking. - 📱 An [iOS app](https://apps.apple.com/us/app/hoarder-app/id6479258022), and an [Android app](https://play.google.com/store/apps/details?id=app.hoarder.hoardermobile&pcampaignid=web_share). - 🗄️ Full page archival (using [monolith](https://github.com/Y2Z/monolith)) to protect against link rot. diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md index 9abd6fb2..8e66a407 100644 --- a/docs/docs/03-configuration.md +++ b/docs/docs/03-configuration.md @@ -26,7 +26,7 @@ When setting up OAuth, the allowed redirect URLs configured at the provider shou ::: | Name | Required | Default | Description | -| ------------------------------------------- | -------- | ---------------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ------------------------------------------- | -------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | DISABLE_SIGNUPS | No | false | If enabled, no new signups will be allowed and the signup button will be disabled in the UI | | DISABLE_PASSWORD_AUTH | No | false | If enabled, only signups and logins using OAuth are allowed and the signup button and login form for local accounts will be disabled in the UI | | OAUTH_WELLKNOWN_URL | No | Not set | The "wellknown Url" for openid-configuration as provided by the OAuth provider | @@ -74,3 +74,13 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin | CRAWLER_FULL_PAGE_ARCHIVE | No | false | Whether to store a full local copy of the page or not. 
Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. | | CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit | | CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection | + +## OCR Configs + +Hoarder uses [tesseract.js](https://github.com/naptha/tesseract.js) to extract text from images. + +| Name | Required | Default | Description | +| ------------------------ | -------- | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| OCR_CACHE_DIR | No | $TEMP_DIR | The dir where tesseract will download its models. By default, those models are not persisted and stored in the OS' temp dir. | +| OCR_LANGS | No | eng | Comma separated list of the language codes that you want tesseract to support. You can find the language codes [here](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). Set to empty string to disable OCR. | +| OCR_CONFIDENCE_THRESHOLD | No | 50 | A number between 0 and 100 indicating the minimum acceptable confidence from tesseract. If tesseract's confidence is lower than this value, extracted text won't be stored. 
| diff --git a/packages/shared/config.ts b/packages/shared/config.ts index 325d9ffa..41430566 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -25,6 +25,12 @@ const allEnv = z.object({ INFERENCE_TEXT_MODEL: z.string().default("gpt-4o-mini"), INFERENCE_IMAGE_MODEL: z.string().default("gpt-4o-mini"), INFERENCE_CONTEXT_LENGTH: z.coerce.number().default(2048), + OCR_CACHE_DIR: z.string().optional(), + OCR_LANGS: z + .string() + .default("eng") + .transform((val) => val.split(",")), + OCR_CONFIDENCE_THRESHOLD: z.coerce.number().default(50), CRAWLER_HEADLESS_BROWSER: stringBool("true"), BROWSER_WEB_URL: z.string().url().optional(), BROWSER_WEBSOCKET_URL: z.string().url().optional(), @@ -90,6 +96,11 @@ const serverConfigSchema = allEnv.transform((val) => { fullPageScreenshot: val.CRAWLER_FULL_PAGE_SCREENSHOT, fullPageArchive: val.CRAWLER_FULL_PAGE_ARCHIVE, }, + ocr: { + langs: val.OCR_LANGS, + cacheDir: val.OCR_CACHE_DIR, + confidenceThreshold: val.OCR_CONFIDENCE_THRESHOLD, + }, meilisearch: val.MEILI_ADDR ? 
{ address: val.MEILI_ADDR, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d068a1d8..4bc0d16b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -770,6 +770,9 @@ importers: puppeteer-extra-plugin-stealth: specifier: ^2.11.2 version: 2.11.2(puppeteer-extra@3.3.6(puppeteer@22.3.0(typescript@5.3.3))) + tesseract.js: + specifier: ^5.1.1 + version: 5.1.1 tsx: specifier: ^4.7.1 version: 4.7.1 @@ -5153,6 +5156,9 @@ packages: blueimp-md5@2.19.0: resolution: {integrity: sha512-DRQrD6gJyy8FbiE4s+bDoXS9hiW3Vbx5uCdwvcCf3zLHL+Iv7LtGHLpr+GZV8rHG8tK766FGYBwRbu8pELTt+w==} + bmp-js@0.1.0: + resolution: {integrity: sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==} + body-parser@1.20.2: resolution: {integrity: sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==} engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16} @@ -7771,6 +7777,9 @@ packages: peerDependencies: postcss: ^8.1.0 + idb-keyval@6.2.1: + resolution: {integrity: sha512-8Sb3veuYCyrZL+VBt9LJfZjLUPWVvqn8tG28VqYNFCo43KHcKuq+b4EiXGeuaLAQWL2YmyDgMp2aSpH9JHsEQg==} + idb@7.1.1: resolution: {integrity: sha512-gchesWBzyvGHRO9W8tzUWFDycow5gwjvFKfyV9FF32Y7F50yZMp7mP+T2mJIWFx49zicqyC4uefHM17o6xKIVQ==} @@ -7970,6 +7979,9 @@ packages: engines: {node: '>=8'} hasBin: true + is-electron@2.2.2: + resolution: {integrity: sha512-FO/Rhvz5tuw4MCWkpMzHFKWD2LsfHzIb7i6MdPYZ/KW7AlxawyLkqdy+jPZP1WubqEADE3O4FUENlJHDfQASRg==} + is-extendable@0.1.1: resolution: {integrity: sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==} engines: {node: '>=0.10.0'} @@ -8167,6 +8179,9 @@ packages: resolution: {integrity: sha512-kNciklu//Ki8BUmRseLTfG/WW55qDHavf3MKUic8wvXR3d7etbSMoQPTpjvDeLVekESSgJM4AG+BESIKU02u3A==} engines: {node: '>= 4'} + is-url@1.2.4: + resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==} + is-valid-path@0.1.1: resolution: {integrity: 
sha512-+kwPrVDu9Ms03L90Qaml+79+6DZHqHyRoANI6IsZJ/g8frhnfchDOBCa0RbQ6/kdHt5CS5OeIEyrYznNuVN+8A==} engines: {node: '>=0.10.0'} @@ -9725,6 +9740,10 @@ packages: openapi3-ts@4.4.0: resolution: {integrity: sha512-9asTNB9IkKEzWMcHmVZE7Ts3kC9G7AFHfs8i7caD8HbI76gEjdkId4z/AkP83xdZsH7PLAnnbl47qZkXuxpArw==} + opencollective-postinstall@2.0.3: + resolution: {integrity: sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==} + hasBin: true + opener@1.5.2: resolution: {integrity: sha512-ur5UIdyw5Y7yEj9wLzhqXiy6GZ3Mwx0yGI+5sMn2r0N0v3cKJvUmFH5yPP+WXh9e0xfyzyJX95D8l088DNFj7A==} hasBin: true @@ -12010,6 +12029,12 @@ packages: engines: {node: '>=10'} hasBin: true + tesseract.js-core@5.1.1: + resolution: {integrity: sha512-KX3bYSU5iGcO1XJa+QGPbi+Zjo2qq6eBhNjSGR5E5q0JtzkoipJKOUQD7ph8kFyteCEfEQ0maWLu8MCXtvX5uQ==} + + tesseract.js@5.1.1: + resolution: {integrity: sha512-lzVl/Ar3P3zhpUT31NjqeCo1f+D5+YfpZ5J62eo2S14QNVOmHBTtbchHm/YAbOOOzCegFnKf4B3Qih9LuldcYQ==} + text-hex@1.0.0: resolution: {integrity: sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==} @@ -12633,6 +12658,9 @@ packages: warn-once@0.1.1: resolution: {integrity: sha512-VkQZJbO8zVImzYFteBXvBOZEl1qL175WH8VmZcxF2fZAoudNhNDvHi+doCaAEdU2l2vtcIwa2zn0QK5+I1HQ3Q==} + wasm-feature-detect@1.8.0: + resolution: {integrity: sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==} + watchpack@2.4.0: resolution: {integrity: sha512-Lcvm7MGST/4fup+ifyKi2hjyIAwcdI4HRgtvTpIUxBRhB+RFtUh8XtDOxUfctVCnhVi+QQj49i91OyvzkJl6cg==} engines: {node: '>=10.13.0'} @@ -13044,6 +13072,9 @@ packages: resolution: {integrity: sha512-Ct97huExsu7cWeEjmrXlofevF8CvzUglJ4iGUet5B8xn1oumtAZBpHU4GzYuoE6PVqcZ5hghtBrSlhwHuR1Jmw==} engines: {node: '>=18'} + zlibjs@0.3.1: + resolution: {integrity: sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==} + zod@3.22.4: resolution: {integrity: 
sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==} @@ -19970,6 +20001,9 @@ snapshots: blueimp-md5@2.19.0: dev: false + bmp-js@0.1.0: + dev: false + body-parser@1.20.2: dependencies: bytes: 3.1.2 @@ -23733,6 +23767,9 @@ snapshots: postcss: 8.4.47 dev: false + idb-keyval@6.2.1: + dev: false + idb@7.1.1: dev: false @@ -23931,6 +23968,9 @@ snapshots: is-docker@2.2.1: dev: false + is-electron@2.2.2: + dev: false + is-extendable@0.1.1: dev: false @@ -24106,6 +24146,9 @@ snapshots: punycode2: 1.0.1 dev: false + is-url@1.2.4: + dev: false + is-valid-path@0.1.1: dependencies: is-invalid-path: 0.1.0 @@ -26505,6 +26548,9 @@ snapshots: yaml: 2.6.0 dev: false + opencollective-postinstall@2.0.3: + dev: false + opener@1.5.2: dev: false @@ -29579,6 +29625,25 @@ snapshots: commander: 2.20.3 source-map-support: 0.5.21 + tesseract.js-core@5.1.1: + dev: false + + tesseract.js@5.1.1: + dependencies: + bmp-js: 0.1.0 + idb-keyval: 6.2.1 + is-electron: 2.2.2 + is-url: 1.2.4 + node-fetch: 2.7.0 + opencollective-postinstall: 2.0.3 + regenerator-runtime: 0.13.11 + tesseract.js-core: 5.1.1 + wasm-feature-detect: 1.8.0 + zlibjs: 0.3.1 + transitivePeerDependencies: + - encoding + dev: false + text-hex@1.0.0: dev: false @@ -30269,6 +30334,9 @@ snapshots: warn-once@0.1.1: dev: false + wasm-feature-detect@1.8.0: + dev: false + watchpack@2.4.0: dependencies: glob-to-regexp: 0.4.1 @@ -30895,6 +30963,9 @@ snapshots: yoctocolors@2.0.2: dev: false + zlibjs@0.3.1: + dev: false + zod@3.22.4: {} zustand@4.5.1(@types/react@18.2.58)(react@18.2.0): |
