Files
OpenArchiver/services/ocr-service.html
2026-03-30 20:29:40 +00:00

31 lines
36 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en-US" dir="ltr">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>OCR Service | Open Archiver Docs</title>
<meta name="description" content="Official documentation for the Open Archiver project.">
<meta name="generator" content="VitePress v1.6.4">
<link rel="preload stylesheet" href="/assets/style.BRVqMv4m.css" as="style">
<link rel="preload stylesheet" href="/vp-icons.css" as="style">
<script type="module" src="/assets/app.BneHbokF.js"></script>
<link rel="preload" href="/assets/inter-roman-latin.Di8DUHzh.woff2" as="font" type="font/woff2" crossorigin="">
<link rel="modulepreload" href="/assets/chunks/theme.DMwvsXz4.js">
<link rel="modulepreload" href="/assets/chunks/framework.DHqKNX4U.js">
<link rel="modulepreload" href="/assets/services_ocr-service.md.B6-nFSTJ.lean.js">
<script defer src="https://analytics.openarchiver.com/script.js" data-website-id="2c8b452e-eab5-4f82-8ead-902d8f8b976f"></script>
<link rel="icon" href="/logo-sq.svg">
<script id="check-dark-mode">(()=>{const e=localStorage.getItem("vitepress-theme-appearance")||"auto",a=window.matchMedia("(prefers-color-scheme: dark)").matches;(!e||e==="auto"?a:e==="dark")&&document.documentElement.classList.add("dark")})();</script>
<script id="check-mac-os">document.documentElement.classList.toggle("mac",/Mac|iPhone|iPod|iPad/i.test(navigator.platform));</script>
</head>
<body>
<div id="app"><div class="Layout" data-v-6d457d7f><!--[--><!--]--><!--[--><span tabindex="-1" data-v-9f15b6e6></span><a href="#VPContent" class="VPSkipLink visually-hidden" data-v-9f15b6e6>Skip to content</a><!--]--><!----><header class="VPNav" data-v-6d457d7f data-v-06aeb22c><div class="VPNavBar" data-v-06aeb22c data-v-9f44cfca><div class="wrapper" data-v-9f44cfca><div class="container" data-v-9f44cfca><div class="title" data-v-9f44cfca><div class="VPNavBarTitle has-sidebar" data-v-9f44cfca data-v-cea99ba2><a class="title" href="/" data-v-cea99ba2><!--[--><!--]--><!--[--><img class="VPImage logo" src="/logo-sq.svg" alt data-v-384abc6c><!--]--><span data-v-cea99ba2>Open Archiver Docs</span><!--[--><!--]--></a></div></div><div class="content" data-v-9f44cfca><div class="content-body" data-v-9f44cfca><!--[--><!--]--><div class="VPNavBarSearch search" data-v-9f44cfca><!--[--><!----><div id="local-search"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><span class="vp-icon DocSearch-Search-Icon"></span><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"><kbd class="DocSearch-Button-Key"></kbd><kbd class="DocSearch-Button-Key">K</kbd></span></button></div><!--]--></div><nav aria-labelledby="main-nav-aria-label" class="VPNavBarMenu menu" data-v-9f44cfca data-v-bc890e2b><span id="main-nav-aria-label" class="visually-hidden" data-v-bc890e2b> Main Navigation </span><!--[--><!--[--><a class="VPLink link VPNavBarMenuLink" href="/" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Home</span><!--]--></a><!--]--><!--[--><a class="VPLink link vp-external-link-icon VPNavBarMenuLink" href="https://github.com/LogicLabs-OU/OpenArchiver" target="_blank" rel="noreferrer" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Github</span><!--]--></a><!--]--><!--[--><a class="VPLink link vp-external-link-icon VPNavBarMenuLink" href="https://openarchiver.com/" target="_blank" rel="noreferrer" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Website</span><!--]--></a><!--]--><!--[--><a class="VPLink link vp-external-link-icon VPNavBarMenuLink" href="https://discord.gg/MTtD7BhuTQ" target="_blank" rel="noreferrer" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Discord</span><!--]--></a><!--]--><!--]--></nav><!----><div class="VPNavBarAppearance appearance" data-v-9f44cfca data-v-1a8fd632><button class="VPSwitch VPSwitchAppearance" type="button" role="switch" title aria-checked="false" data-v-1a8fd632 data-v-77125d61 data-v-44c0c9c3><span class="check" data-v-44c0c9c3><span class="icon" data-v-44c0c9c3><!--[--><span class="vpi-sun sun" data-v-77125d61></span><span class="vpi-moon moon" data-v-77125d61></span><!--]--></span></span></button></div><!----><div class="VPFlyout VPNavBarExtra extra" data-v-9f44cfca data-v-cb22ae83 data-v-ce17dc3c><button type="button" class="button" aria-haspopup="true" aria-expanded="false" aria-label="extra navigation" data-v-ce17dc3c><span class="vpi-more-horizontal icon" data-v-ce17dc3c></span></button><div class="menu" data-v-ce17dc3c><div class="VPMenu" data-v-ce17dc3c data-v-a0929307><!----><!--[--><!--[--><!----><div class="group" data-v-cb22ae83><div class="item appearance" data-v-cb22ae83><p class="label" data-v-cb22ae83>Appearance</p><div class="appearance-action" data-v-cb22ae83><button class="VPSwitch VPSwitchAppearance" type="button" role="switch" title aria-checked="false" data-v-cb22ae83 data-v-77125d61 data-v-44c0c9c3><span class="check" data-v-44c0c9c3><span class="icon" data-v-44c0c9c3><!--[--><span class="vpi-sun sun" data-v-77125d61></span><span class="vpi-moon moon" data-v-77125d61></span><!--]--></span></span></button></div></div></div><!----><!--]--><!--]--></div></div></div><!--[--><!--]--><button type="button" class="VPNavBarHamburger hamburger" aria-label="mobile navigation" aria-expanded="false" aria-controls="VPNavScreen" data-v-9f44cfca data-v-42f375ed><span class="container" data-v-42f375ed><span class="top" data-v-42f375ed></span><span class="middle" data-v-42f375ed></span><span class="bottom" data-v-42f375ed></span></span></button></div></div></div></div><div class="divider" data-v-9f44cfca><div class="divider-line" data-v-9f44cfca></div></div></div><!----></header><div class="VPLocalNav has-sidebar empty" data-v-6d457d7f data-v-e17e33c6><div class="container" data-v-e17e33c6><button class="menu" aria-expanded="false" aria-controls="VPSidebarNav" data-v-e17e33c6><span class="vpi-align-left menu-icon" data-v-e17e33c6></span><span class="menu-text" data-v-e17e33c6>Menu</span></button><div class="VPLocalNavOutlineDropdown" style="--vp-vh:0px;" data-v-e17e33c6 data-v-bab32157><button data-v-bab32157>Return to top</button><!----></div></div></div><aside class="VPSidebar" data-v-6d457d7f data-v-91b447a8><div class="curtain" data-v-91b447a8></div><nav class="nav" id="VPSidebarNav" aria-labelledby="sidebar-aria-label" tabindex="-1" data-v-91b447a8><span class="visually-hidden" id="sidebar-aria-label" data-v-91b447a8> Sidebar Navigation </span><!--[--><!--]--><!--[--><div class="no-transition group" data-v-6b998b36><section class="VPSidebarItem level-0" data-v-6b998b36 data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h2 class="text" data-v-5f79b55d>User Guides</h2><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Get Started</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/installation.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Installation</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/integrity-check.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Email Integrity Check</p><!--]--></a><!----></div><!----></div><section class="VPSidebarItem level-1 collapsible collapsed is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/" data-v-5f79b55d><!--[--><h3 class="text" data-v-5f79b55d>Email Providers</h3><!--]--></a><div class="caret" role="button" aria-label="toggle section" tabindex="0" data-v-5f79b55d><span class="vpi-chevron-right caret-icon" data-v-5f79b55d></span></div></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/imap.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Generic IMAP Server</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/google-workspace.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Google Workspace</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/microsoft-365.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Microsoft 365</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/eml.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>EML Import</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/pst.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>PST Import</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/mbox.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Mbox Import</p><!--]--></a><!----></div><!----></div><!--]--></div></section><section class="VPSidebarItem level-1 collapsible collapsed" data-v-5f79b55d data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h3 class="text" data-v-5f79b55d>Settings</h3><div class="caret" role="button" aria-label="toggle section" tabindex="0" data-v-5f79b55d><span class="vpi-chevron-right caret-icon" data-v-5f79b55d></span></div></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/settings/system.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>System</p><!--]--></a><!----></div><!----></div><!--]--></div></section><section class="VPSidebarItem level-1 collapsible collapsed" data-v-5f79b55d data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h3 class="text" data-v-5f79b55d>Upgrading and Migration</h3><div class="caret" role="button" aria-label="toggle section" tabindex="0" data-v-5f79b55d><span class="vpi-chevron-right caret-icon" data-v-5f79b55d></span></div></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/upgrade-and-migration/upgrade.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Upgrading</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/upgrade-and-migration/meilisearch-upgrade.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Meilisearch Upgrade</p><!--]--></a><!----></div><!----></div><!--]--></div></section><!--]--></div></section></div><div class="no-transition group" data-v-6b998b36><section class="VPSidebarItem level-0" data-v-6b998b36 data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h2 class="text" data-v-5f79b55d>API Reference</h2><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Overview</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/authentication.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Authentication</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/rate-limiting.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Rate Limiting</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/auth.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Auth</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/archived-email.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Archived Email</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/dashboard.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Dashboard</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/ingestion.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Ingestion</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/integrity.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Integrity Check</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/search.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Search</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/storage.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Storage</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/upload.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Upload</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/jobs.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Jobs</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/users.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Users</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/iam.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>IAM</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/api-keys.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>API Keys</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/settings.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Settings</p><!--]--></a><!----></div><!----></div><!--]--></div></section></div><div class="no-transition group" data-v-6b998b36><section class="VPSidebarItem level-0 has-active" data-v-6b998b36 data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h2 class="text" data-v-5f79b55d>Services</h2><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Overview</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/storage-service.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Storage Service</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/ocr-service.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>OCR Service</p><!--]--></a><!----></div><!----></div><section class="VPSidebarItem level-1" data-v-5f79b55d data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h3 class="text" data-v-5f79b55d>IAM Service</h3><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/iam-service/iam-policy.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>IAM Policies</p><!--]--></a><!----></div><!----></div><!--]--></div></section><!--]--></div></section></div><!--]--><!--[--><!--]--></nav></aside><div class="VPContent has-sidebar" id="VPContent" data-v-6d457d7f data-v-413f3f32><div class="VPDoc has-sidebar has-aside" data-v-413f3f32 data-v-29334b8c><!--[--><!--]--><div class="container" data-v-29334b8c><div class="aside" data-v-29334b8c><div class="aside-curtain" data-v-29334b8c></div><div class="aside-container" data-v-29334b8c><div class="aside-content" data-v-29334b8c><div class="VPDocAside" data-v-29334b8c data-v-0970baee><!--[--><!--]--><!--[--><!--]--><nav aria-labelledby="doc-outline-aria-label" class="VPDocAsideOutline" data-v-0970baee data-v-1cf7166c><div class="content" data-v-1cf7166c><div class="outline-marker" data-v-1cf7166c></div><div aria-level="2" class="outline-title" id="doc-outline-aria-label" role="heading" data-v-1cf7166c>On this page</div><ul class="VPDocOutlineItem root" data-v-1cf7166c data-v-c6ed6775><!--[--><!--]--></ul></div></nav><!--[--><!--]--><div class="spacer" data-v-0970baee></div><!--[--><!--]--><!----><!--[--><!--]--><!--[--><!--]--></div></div></div></div><div class="content" data-v-29334b8c><div class="content-container" data-v-29334b8c><!--[--><!--]--><main class="main" data-v-29334b8c><div style="position:relative;" class="vp-doc _services_ocr-service" data-v-29334b8c><div><h1 id="ocr-service" tabindex="-1">OCR Service <a class="header-anchor" href="#ocr-service" aria-label="Permalink to &quot;OCR Service&quot;"></a></h1><p>The OCR (Optical Character Recognition) and text extraction service is responsible for extracting plain text content from various file formats, such as PDFs, Office documents, and more. This is a crucial component for making email attachments searchable.</p><h2 id="overview" tabindex="-1">Overview <a class="header-anchor" href="#overview" aria-label="Permalink to &quot;Overview&quot;"></a></h2><p>The system employs a two-pronged approach for text extraction:</p><ol><li><strong>Primary Extractor (Apache Tika)</strong>: A powerful and versatile toolkit that can extract text from a wide variety of file formats. It is the recommended method for its superior performance and format support.</li><li><strong>Legacy Extractor</strong>: A fallback mechanism that uses a combination of libraries (<code>pdf2json</code>, <code>mammoth</code>, <code>xlsx</code>) for common file types like PDF, DOCX, and XLSX. This is used when Apache Tika is not configured.</li></ol><p>The main logic resides in <code>packages/backend/src/helpers/textExtractor.ts</code>, which decides which extraction method to use based on the application&#39;s configuration.</p><h2 id="configuration" tabindex="-1">Configuration <a class="header-anchor" href="#configuration" aria-label="Permalink to &quot;Configuration&quot;"></a></h2><p>To enable the primary text extraction method, you must configure the URL of an Apache Tika server instance in your environment variables.</p><p>In your <code>.env</code> file, set the <code>TIKA_URL</code>:</p><div class="language-env vp-adaptive-theme"><button title="Copy Code" class="copy"></button><span class="lang">env</span><pre class="shiki shiki-themes github-light github-dark vp-code" tabindex="0"><code><span class="line"><span># .env.example</span></span>
<span class="line"><span></span></span>
<span class="line"><span># Apache Tika Integration</span></span>
<span class="line"><span># ONLY active if TIKA_URL is set</span></span>
<span class="line"><span>TIKA_URL=http://tika:9998</span></span></code></pre></div><p>If <code>TIKA_URL</code> is not set, the system will automatically fall back to the legacy extraction methods. The service performs a health check on startup to verify connectivity with the Tika server.</p><h2 id="file-size-limits" tabindex="-1">File Size Limits <a class="header-anchor" href="#file-size-limits" aria-label="Permalink to &quot;File Size Limits&quot;"></a></h2><p>To prevent excessive memory usage and processing time, the service imposes a general size limit on files submitted for text extraction. Files larger than the configured limit will be skipped.</p><ul><li><strong>With Apache Tika</strong>: The maximum file size is <strong>100MB</strong>.</li><li><strong>With Legacy Fallback</strong>: The maximum file size is <strong>50MB</strong>.</li></ul><h2 id="supported-file-formats" tabindex="-1">Supported File Formats <a class="header-anchor" href="#supported-file-formats" aria-label="Permalink to &quot;Supported File Formats&quot;"></a></h2><p>The service&#39;s ability to extract text depends on whether it&#39;s using Apache Tika or the legacy fallback methods.</p><h3 id="with-apache-tika" tabindex="-1">With Apache Tika <a class="header-anchor" href="#with-apache-tika" aria-label="Permalink to &quot;With Apache Tika&quot;"></a></h3><p>When <code>TIKA_URL</code> is configured, the service can process a vast range of file formats. Apache Tika is designed for broad compatibility and supports hundreds of file types, including but not limited to:</p><ul><li>Portable Document Format (PDF)</li><li>Microsoft Office formats (DOC, DOCX, PPT, PPTX, XLS, XLSX)</li><li>OpenDocument Formats (ODT, ODS, ODP)</li><li>Rich Text Format (RTF)</li><li>Plain Text (TXT, CSV, JSON, XML, HTML)</li><li>Image formats with OCR capabilities (PNG, JPEG, TIFF)</li><li>Archive formats (ZIP, TAR, GZ)</li><li>Email formats (EML, MSG)</li></ul><p>For a complete and up-to-date list, please refer to the official <a href="https://tika.apache.org/3.2.3/formats.html" target="_blank" rel="noreferrer">Apache Tika documentation</a>.</p><h3 id="with-legacy-fallback" tabindex="-1">With Legacy Fallback <a class="header-anchor" href="#with-legacy-fallback" aria-label="Permalink to &quot;With Legacy Fallback&quot;"></a></h3><p>When Tika is not configured, text extraction is limited to the following formats:</p><ul><li><code>application/pdf</code> (PDF)</li><li><code>application/vnd.openxmlformats-officedocument.wordprocessingml.document</code> (DOCX)</li><li><code>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</code> (XLSX)</li><li>Plain text formats such as <code>text/*</code>, <code>application/json</code>, and <code>application/xml</code>.</li></ul><h2 id="features-of-the-tika-integration-ocrservice" tabindex="-1">Features of the Tika Integration (<code>OcrService</code>) <a class="header-anchor" href="#features-of-the-tika-integration-ocrservice" aria-label="Permalink to &quot;Features of the Tika Integration (`OcrService`)&quot;"></a></h2><p>The <code>OcrService</code> (<code>packages/backend/src/services/OcrService.ts</code>) provides several enhancements to make text extraction efficient and robust.</p><h3 id="caching" tabindex="-1">Caching <a class="header-anchor" href="#caching" aria-label="Permalink to &quot;Caching&quot;"></a></h3><p>To avoid redundant processing of the same file, the service implements a simple LRU (Least Recently Used) cache.</p><ul><li><strong>Cache Key</strong>: A SHA-256 hash of the file&#39;s buffer is used as the cache key.</li><li><strong>Functionality</strong>: If a file with the same hash is processed again, the text content is served directly from the cache, saving significant processing time.</li><li><strong>Statistics</strong>: The service keeps track of cache hits, misses, and the hit rate for performance monitoring.</li></ul><h3 id="concurrency-management-semaphore" tabindex="-1">Concurrency Management (Semaphore) <a class="header-anchor" href="#concurrency-management-semaphore" aria-label="Permalink to &quot;Concurrency Management (Semaphore)&quot;"></a></h3><p>Extracting text from large files can be resource-intensive. To prevent the Tika server from being overwhelmed by multiple requests for the <em>same file</em> simultaneously (e.g., during a large import), a semaphore mechanism is used.</p><ul><li><strong>Functionality</strong>: If a request for a specific file (identified by its hash) is already in progress, any subsequent requests for the same file will wait for the first one to complete and then use its result.</li><li><strong>Benefit</strong>: This deduplicates parallel processing efforts and reduces unnecessary load on the Tika server.</li></ul><h3 id="health-check-and-dns-fallback" tabindex="-1">Health Check and DNS Fallback <a class="header-anchor" href="#health-check-and-dns-fallback" aria-label="Permalink to &quot;Health Check and DNS Fallback&quot;"></a></h3><ul><li><strong>Availability Check</strong>: The service includes a <code>checkTikaAvailability</code> method to verify that the Tika server is reachable and operational. This check is performed on application startup.</li><li><strong>DNS Fallback</strong>: For convenience in Docker environments, if the Tika URL uses the hostname <code>tika</code> (e.g., <code>http://tika:9998</code>), the service will automatically attempt a fallback to <code>localhost</code> if the initial connection fails.</li></ul><h2 id="legacy-fallback-methods" tabindex="-1">Legacy Fallback Methods <a class="header-anchor" href="#legacy-fallback-methods" aria-label="Permalink to &quot;Legacy Fallback Methods&quot;"></a></h2><p>When Tika is not available, the <code>extractTextLegacy</code> function in <code>textExtractor.ts</code> handles extraction for a limited set of MIME types:</p><ul><li><code>application/pdf</code>: Processed using <code>pdf2json</code>. Includes a 50MB size limit and a 5-second timeout to prevent memory issues.</li><li><code>application/vnd.openxmlformats-officedocument.wordprocessingml.document</code> (DOCX): Processed using <code>mammoth</code>.</li><li><code>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</code> (XLSX): Processed using <code>xlsx</code>.</li><li>Plain text formats (<code>text/*</code>, <code>application/json</code>, <code>application/xml</code>): Converted directly from the buffer.</li></ul></div></div></main><footer class="VPDocFooter" data-v-29334b8c data-v-a014c1de><!--[--><!--]--><!----><nav class="prev-next" aria-labelledby="doc-footer-aria-label" data-v-a014c1de><span class="visually-hidden" id="doc-footer-aria-label" data-v-a014c1de>Pager</span><div class="pager" data-v-a014c1de><a class="VPLink link pager-link prev" href="/services/storage-service.html" data-v-a014c1de><!--[--><span class="desc" data-v-a014c1de>Previous page</span><span class="title" data-v-a014c1de>Storage Service</span><!--]--></a></div><div class="pager" data-v-a014c1de><a class="VPLink link pager-link next" href="/services/iam-service/iam-policy.html" data-v-a014c1de><!--[--><span class="desc" data-v-a014c1de>Next page</span><span class="title" data-v-a014c1de>IAM Policies</span><!--]--></a></div></nav></footer><!--[--><!--]--></div></div></div><!--[--><!--]--></div></div><!----><!--[--><!--]--></div></div>
<script>window.__VP_HASH_MAP__=JSON.parse("{\"api_api-keys.md\":\"Dn21HeSJ\",\"api_archived-email.md\":\"BT-mHBqK\",\"api_auth.md\":\"D-468dKx\",\"api_authentication.md\":\"3ZX-rlxc\",\"api_dashboard.md\":\"CSiI_2IK\",\"api_iam.md\":\"B5FbdpjA\",\"api_index.md\":\"B_r30QlK\",\"api_ingestion.md\":\"D362LBU_\",\"api_integrity.md\":\"B9VpMKD6\",\"api_jobs.md\":\"CTasZcNL\",\"api_rate-limiting.md\":\"BqhkXaUd\",\"api_search.md\":\"Cu20yO7t\",\"api_settings.md\":\"CwhPvkZf\",\"api_storage.md\":\"CTn9PQaD\",\"api_upload.md\":\"CfCXTOWM\",\"api_users.md\":\"BdQul9nH\",\"enterprise_audit-log_api.md\":\"jQPQ8cO8\",\"enterprise_audit-log_audit-service.md\":\"D9higq7j\",\"enterprise_audit-log_guide.md\":\"DuHTEaId\",\"enterprise_audit-log_index.md\":\"By894rqA\",\"enterprise_legal-holds_api.md\":\"E6pKaGoK\",\"enterprise_legal-holds_guide.md\":\"4JCnApXO\",\"enterprise_legal-holds_index.md\":\"CXivackd\",\"enterprise_retention-labels_api.md\":\"De0lRuGD\",\"enterprise_retention-labels_automated-tagging.md\":\"uMCViAe6\",\"enterprise_retention-labels_guide.md\":\"LMnSU6ly\",\"enterprise_retention-labels_index.md\":\"gaps1eMo\",\"enterprise_retention-policy_api.md\":\"Dv1_ca5K\",\"enterprise_retention-policy_guide.md\":\"BxhZekoC\",\"enterprise_retention-policy_index.md\":\"C_qinQpQ\",\"enterprise_retention-policy_lifecycle-worker.md\":\"CLFQ4ZEf\",\"enterprise_retention-policy_retention-service.md\":\"0HsJnlOP\",\"index.md\":\"ChFzL1lA\",\"services_iam-service_iam-policy.md\":\"DSOajx7I\",\"services_index.md\":\"BmWnT3gS\",\"services_job-queue.md\":\"_FwJJttJ\",\"services_ocr-service.md\":\"B6-nFSTJ\",\"services_storage-service.md\":\"Bpdj_lG7\",\"summary.md\":\"C_r3wuBH\",\"user-guides_email-providers_eml.md\":\"DV6Zs6RI\",\"user-guides_email-providers_google-workspace.md\":\"CaNRUOL2\",\"user-guides_email-providers_imap.md\":\"CQ8xQOnx\",\"user-guides_email-providers_index.md\":\"D2pQ89-G\",\"user-guides_email-providers_mbox.md\":\"B68Wejyx\",\"user-guides_email-providers_merging-sources.md\":\"D0DI--4D\",\"user-guides_email-providers_microsoft-365.md\":\"BNGY0WTw\",\"user-guides_email-providers_pst.md\":\"Dech_zDS\",\"user-guides_installation.md\":\"CYbox6PW\",\"user-guides_integrity-check.md\":\"BfM1nsWe\",\"user-guides_settings_system.md\":\"BI9F6Lpw\",\"user-guides_troubleshooting_cors-errors.md\":\"wu2DtNUw\",\"user-guides_upgrade-and-migration_meilisearch-upgrade.md\":\"BTQMSseX\",\"user-guides_upgrade-and-migration_upgrade.md\":\"DlOu8Pon\"}");window.__VP_SITE_DATA__=JSON.parse("{\"lang\":\"en-US\",\"dir\":\"ltr\",\"title\":\"Open Archiver Docs\",\"description\":\"Official documentation for the Open Archiver project.\",\"base\":\"/\",\"head\":[],\"router\":{\"prefetchLinks\":true},\"appearance\":true,\"themeConfig\":{\"search\":{\"provider\":\"local\"},\"logo\":{\"src\":\"/logo-sq.svg\"},\"nav\":[{\"text\":\"Home\",\"link\":\"/\"},{\"text\":\"Github\",\"link\":\"https://github.com/LogicLabs-OU/OpenArchiver\"},{\"text\":\"Website\",\"link\":\"https://openarchiver.com/\"},{\"text\":\"Discord\",\"link\":\"https://discord.gg/MTtD7BhuTQ\"}],\"sidebar\":[{\"text\":\"User Guides\",\"items\":[{\"text\":\"Get Started\",\"link\":\"/\"},{\"text\":\"Installation\",\"link\":\"/user-guides/installation\"},{\"text\":\"Email Integrity Check\",\"link\":\"/user-guides/integrity-check\"},{\"text\":\"Email Providers\",\"link\":\"/user-guides/email-providers/\",\"collapsed\":true,\"items\":[{\"text\":\"Generic IMAP Server\",\"link\":\"/user-guides/email-providers/imap\"},{\"text\":\"Google Workspace\",\"link\":\"/user-guides/email-providers/google-workspace\"},{\"text\":\"Microsoft 365\",\"link\":\"/user-guides/email-providers/microsoft-365\"},{\"text\":\"EML Import\",\"link\":\"/user-guides/email-providers/eml\"},{\"text\":\"PST Import\",\"link\":\"/user-guides/email-providers/pst\"},{\"text\":\"Mbox Import\",\"link\":\"/user-guides/email-providers/mbox\"}]},{\"text\":\"Settings\",\"collapsed\":true,\"items\":[{\"text\":\"System\",\"link\":\"/user-guides/settings/system\"}]},{\"text\":\"Upgrading and Migration\",\"collapsed\":true,\"items\":[{\"text\":\"Upgrading\",\"link\":\"/user-guides/upgrade-and-migration/upgrade\"},{\"text\":\"Meilisearch Upgrade\",\"link\":\"/user-guides/upgrade-and-migration/meilisearch-upgrade\"}]}]},{\"text\":\"API Reference\",\"items\":[{\"text\":\"Overview\",\"link\":\"/api/\"},{\"text\":\"Authentication\",\"link\":\"/api/authentication\"},{\"text\":\"Rate Limiting\",\"link\":\"/api/rate-limiting\"},{\"text\":\"Auth\",\"link\":\"/api/auth\"},{\"text\":\"Archived Email\",\"link\":\"/api/archived-email\"},{\"text\":\"Dashboard\",\"link\":\"/api/dashboard\"},{\"text\":\"Ingestion\",\"link\":\"/api/ingestion\"},{\"text\":\"Integrity Check\",\"link\":\"/api/integrity\"},{\"text\":\"Search\",\"link\":\"/api/search\"},{\"text\":\"Storage\",\"link\":\"/api/storage\"},{\"text\":\"Upload\",\"link\":\"/api/upload\"},{\"text\":\"Jobs\",\"link\":\"/api/jobs\"},{\"text\":\"Users\",\"link\":\"/api/users\"},{\"text\":\"IAM\",\"link\":\"/api/iam\"},{\"text\":\"API Keys\",\"link\":\"/api/api-keys\"},{\"text\":\"Settings\",\"link\":\"/api/settings\"}]},{\"text\":\"Services\",\"items\":[{\"text\":\"Overview\",\"link\":\"/services/\"},{\"text\":\"Storage Service\",\"link\":\"/services/storage-service\"},{\"text\":\"OCR Service\",\"link\":\"/services/ocr-service\"},{\"text\":\"IAM Service\",\"items\":[{\"text\":\"IAM Policies\",\"link\":\"/services/iam-service/iam-policy\"}]}]}]},\"locales\":{},\"scrollOffset\":134,\"cleanUrls\":false}");</script>
</body>
</html>