Files
OpenArchiver/services/ocr-service.html
2026-03-30 20:29:40 +00:00

31 lines
36 KiB
HTML
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en-US" dir="ltr">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>OCR Service | Open Archiver Docs</title>
<meta name="description" content="Official documentation for the Open Archiver project.">
<meta name="generator" content="VitePress v1.6.4">
<link rel="preload stylesheet" href="/assets/style.BRVqMv4m.css" as="style">
<link rel="preload stylesheet" href="/vp-icons.css" as="style">
<script type="module" src="/assets/app.BneHbokF.js"></script>
<link rel="preload" href="/assets/inter-roman-latin.Di8DUHzh.woff2" as="font" type="font/woff2" crossorigin="">
<link rel="modulepreload" href="/assets/chunks/theme.DMwvsXz4.js">
<link rel="modulepreload" href="/assets/chunks/framework.DHqKNX4U.js">
<link rel="modulepreload" href="/assets/services_ocr-service.md.B6-nFSTJ.lean.js">
<script defer src="https://analytics.openarchiver.com/script.js" data-website-id="2c8b452e-eab5-4f82-8ead-902d8f8b976f"></script>
<link rel="icon" href="/logo-sq.svg">
<script id="check-dark-mode">(()=>{const e=localStorage.getItem("vitepress-theme-appearance")||"auto",a=window.matchMedia("(prefers-color-scheme: dark)").matches;(!e||e==="auto"?a:e==="dark")&&document.documentElement.classList.add("dark")})();</script>
<script id="check-mac-os">document.documentElement.classList.toggle("mac",/Mac|iPhone|iPod|iPad/i.test(navigator.platform));</script>
</head>
<body>
<div id="app"><div class="Layout" data-v-6d457d7f><!--[--><!--]--><!--[--><span tabindex="-1" data-v-9f15b6e6></span><a href="#VPContent" class="VPSkipLink visually-hidden" data-v-9f15b6e6>Skip to content</a><!--]--><!----><header class="VPNav" data-v-6d457d7f data-v-06aeb22c><div class="VPNavBar" data-v-06aeb22c data-v-9f44cfca><div class="wrapper" data-v-9f44cfca><div class="container" data-v-9f44cfca><div class="title" data-v-9f44cfca><div class="VPNavBarTitle has-sidebar" data-v-9f44cfca data-v-cea99ba2><a class="title" href="/" data-v-cea99ba2><!--[--><!--]--><!--[--><img class="VPImage logo" src="/logo-sq.svg" alt data-v-384abc6c><!--]--><span data-v-cea99ba2>Open Archiver Docs</span><!--[--><!--]--></a></div></div><div class="content" data-v-9f44cfca><div class="content-body" data-v-9f44cfca><!--[--><!--]--><div class="VPNavBarSearch search" data-v-9f44cfca><!--[--><!----><div id="local-search"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><span class="vp-icon DocSearch-Search-Icon"></span><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"><kbd class="DocSearch-Button-Key"></kbd><kbd class="DocSearch-Button-Key">K</kbd></span></button></div><!--]--></div><nav aria-labelledby="main-nav-aria-label" class="VPNavBarMenu menu" data-v-9f44cfca data-v-bc890e2b><span id="main-nav-aria-label" class="visually-hidden" data-v-bc890e2b> Main Navigation </span><!--[--><!--[--><a class="VPLink link VPNavBarMenuLink" href="/" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Home</span><!--]--></a><!--]--><!--[--><a class="VPLink link vp-external-link-icon VPNavBarMenuLink" href="https://github.com/LogicLabs-OU/OpenArchiver" target="_blank" rel="noreferrer" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Github</span><!--]--></a><!--]--><!--[--><a class="VPLink link vp-external-link-icon VPNavBarMenuLink" href="https://openarchiver.com/" target="_blank" rel="noreferrer" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Website</span><!--]--></a><!--]--><!--[--><a class="VPLink link vp-external-link-icon VPNavBarMenuLink" href="https://discord.gg/MTtD7BhuTQ" target="_blank" rel="noreferrer" tabindex="0" data-v-bc890e2b data-v-358f64d2><!--[--><span data-v-358f64d2>Discord</span><!--]--></a><!--]--><!--]--></nav><!----><div class="VPNavBarAppearance appearance" data-v-9f44cfca data-v-1a8fd632><button class="VPSwitch VPSwitchAppearance" type="button" role="switch" title aria-checked="false" data-v-1a8fd632 data-v-77125d61 data-v-44c0c9c3><span class="check" data-v-44c0c9c3><span class="icon" data-v-44c0c9c3><!--[--><span class="vpi-sun sun" data-v-77125d61></span><span class="vpi-moon moon" data-v-77125d61></span><!--]--></span></span></button></div><!----><div class="VPFlyout VPNavBarExtra extra" data-v-9f44cfca data-v-cb22ae83 data-v-ce17dc3c><button type="button" class="button" aria-haspopup="true" aria-expanded="false" aria-label="extra navigation" data-v-ce17dc3c><span class="vpi-more-horizontal icon" data-v-ce17dc3c></span></button><div class="menu" data-v-ce17dc3c><div class="VPMenu" data-v-ce17dc3c data-v-a0929307><!----><!--[--><!--[--><!----><div class="group" data-v-cb22ae83><div class="item appearance" data-v-cb22ae83><p class="label" data-v-cb22ae83>Appearance</p><div class="appearance-action" data-v-cb22ae83><button class="VPSwitch VPSwitchAppearance" type="button" role="switch" title aria-checked="false" data-v-cb22ae83 data-v-77125d61 data-v-44c0c9c3><span class="check" data-v-44c0c9c3><span class="icon" data-v-44c0c9c3><!--[--><span class="vpi-sun sun" data-v-77125d61></span><span class="vpi-moon moon" data-v-77125d61></span><!--]--></span></span></button></div></div></div><!----><!--]--><!--]--></div></div></div><!--[--><!--]--><button type="button" class="VPNavBarHamburger hamburger" aria-label="mobile navigation" aria-expanded="false" aria-controls="VPNavScreen" data-v-9f44cfca data-v-42f375ed><span class="container" data-v-42f375ed><span class="top" data-v-42f375ed></span><span class="middle" data-v-42f375ed></span><span class="bottom" data-v-42f375ed></span></span></button></div></div></div></div><div class="divider" data-v-9f44cfca><div class="divider-line" data-v-9f44cfca></div></div></div><!----></header><div class="VPLocalNav has-sidebar empty" data-v-6d457d7f data-v-e17e33c6><div class="container" data-v-e17e33c6><button class="menu" aria-expanded="false" aria-controls="VPSidebarNav" data-v-e17e33c6><span class="vpi-align-left menu-icon" data-v-e17e33c6></span><span class="menu-text" data-v-e17e33c6>Menu</span></button><div class="VPLocalNavOutlineDropdown" style="--vp-vh:0px;" data-v-e17e33c6 data-v-bab32157><button data-v-bab32157>Return to top</button><!----></div></div></div><aside class="VPSidebar" data-v-6d457d7f data-v-91b447a8><div class="curtain" data-v-91b447a8></div><nav class="nav" id="VPSidebarNav" aria-labelledby="sidebar-aria-label" tabindex="-1" data-v-91b447a8><span class="visually-hidden" id="sidebar-aria-label" data-v-91b447a8> Sidebar Navigation </span><!--[--><!--]--><!--[--><div class="no-transition group" data-v-6b998b36><section class="VPSidebarItem level-0" data-v-6b998b36 data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h2 class="text" data-v-5f79b55d>User Guides</h2><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Get Started</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/installation.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Installation</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/integrity-check.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Email Integrity Check</p><!--]--></a><!----></div><!----></div><section class="VPSidebarItem level-1 collapsible collapsed is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/" data-v-5f79b55d><!--[--><h3 class="text" data-v-5f79b55d>Email Providers</h3><!--]--></a><div class="caret" role="button" aria-label="toggle section" tabindex="0" data-v-5f79b55d><span class="vpi-chevron-right caret-icon" data-v-5f79b55d></span></div></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/imap.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Generic IMAP Server</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/google-workspace.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Google Workspace</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/microsoft-365.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Microsoft 365</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/eml.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>EML Import</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/pst.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>PST Import</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/email-providers/mbox.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Mbox Import</p><!--]--></a><!----></div><!----></div><!--]--></div></section><section class="VPSidebarItem level-1 collapsible collapsed" data-v-5f79b55d data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h3 class="text" data-v-5f79b55d>Settings</h3><div class="caret" role="button" aria-label="toggle section" tabindex="0" data-v-5f79b55d><span class="vpi-chevron-right caret-icon" data-v-5f79b55d></span></div></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/settings/system.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>System</p><!--]--></a><!----></div><!----></div><!--]--></div></section><section class="VPSidebarItem level-1 collapsible collapsed" data-v-5f79b55d data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h3 class="text" data-v-5f79b55d>Upgrading and Migration</h3><div class="caret" role="button" aria-label="toggle section" tabindex="0" data-v-5f79b55d><span class="vpi-chevron-right caret-icon" data-v-5f79b55d></span></div></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/upgrade-and-migration/upgrade.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Upgrading</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/user-guides/upgrade-and-migration/meilisearch-upgrade.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Meilisearch Upgrade</p><!--]--></a><!----></div><!----></div><!--]--></div></section><!--]--></div></section></div><div class="no-transition group" data-v-6b998b36><section class="VPSidebarItem level-0" data-v-6b998b36 data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h2 class="text" data-v-5f79b55d>API Reference</h2><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Overview</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/authentication.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Authentication</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/rate-limiting.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Rate Limiting</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/auth.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Auth</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/archived-email.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Archived Email</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/dashboard.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Dashboard</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/ingestion.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Ingestion</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/integrity.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Integrity Check</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/search.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Search</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/storage.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Storage</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/upload.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Upload</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/jobs.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Jobs</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/users.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Users</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/iam.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>IAM</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/api-keys.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>API Keys</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/api/settings.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Settings</p><!--]--></a><!----></div><!----></div><!--]--></div></section></div><div class="no-transition group" data-v-6b998b36><section class="VPSidebarItem level-0 has-active" data-v-6b998b36 data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h2 class="text" data-v-5f79b55d>Services</h2><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Overview</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/storage-service.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>Storage Service</p><!--]--></a><!----></div><!----></div><div class="VPSidebarItem level-1 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/ocr-service.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>OCR Service</p><!--]--></a><!----></div><!----></div><section class="VPSidebarItem level-1" data-v-5f79b55d data-v-5f79b55d><div class="item" role="button" tabindex="0" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><h3 class="text" data-v-5f79b55d>IAM Service</h3><!----></div><div class="items" data-v-5f79b55d><!--[--><div class="VPSidebarItem level-2 is-link" data-v-5f79b55d data-v-5f79b55d><div class="item" data-v-5f79b55d><div class="indicator" data-v-5f79b55d></div><a class="VPLink link link" href="/services/iam-service/iam-policy.html" data-v-5f79b55d><!--[--><p class="text" data-v-5f79b55d>IAM Policies</p><!--]--></a><!----></div><!----></div><!--]--></div></section><!--]--></div></section></div><!--]--><!--[--><!--]--></nav></aside><div class="VPContent has-sidebar" id="VPContent" data-v-6d457d7f data-v-413f3f32><div class="VPDoc has-sidebar has-aside" data-v-413f3f32 data-v-29334b8c><!--[--><!--]--><div class="container" data-v-29334b8c><div class="aside" data-v-29334b8c><div class="aside-curtain" data-v-29334b8c></div><div class="aside-container" data-v-29334b8c><div class="aside-content" data-v-29334b8c><div class="VPDocAside" data-v-29334b8c data-v-0970baee><!--[--><!--]--><!--[--><!--]--><nav aria-labelledby="doc-outline-aria-label" class="VPDocAsideOutline" data-v-0970baee data-v-1cf7166c><div class="content" data-v-1cf7166c><div class="outline-marker" data-v-1cf7166c></div><div aria-level="2" class="outline-title" id="doc-outline-aria-label" role="heading" data-v-1cf7166c>On this page</div><ul class="VPDocOutlineItem root" data-v-1cf7166c data-v-c6ed6775><!--[--><!--]--></ul></div></nav><!--[--><!--]--><div class="spacer" data-v-0970baee></div><!--[--><!--]--><!----><!--[--><!--]--><!--[--><!--]--></div></div></div></div><div class="content" data-v-29334b8c><div class="content-container" data-v-29334b8c><!--[--><!--]--><main class="main" data-v-29334b8c><div style="position:relative;" class="vp-doc _services_ocr-service" data-v-29334b8c><div><h1 id="ocr-service" tabindex="-1">OCR Service <a class="header-anchor" href="#ocr-service" aria-label="Permalink to &quot;OCR Service&quot;"></a></h1><p>The OCR (Optical Character Recognition) and text extraction service is responsible for extracting plain text content from various file formats, such as PDFs, Office documents, and more. This is a crucial component for making email attachments searchable.</p><h2 id="overview" tabindex="-1">Overview <a class="header-anchor" href="#overview" aria-label="Permalink to &quot;Overview&quot;"></a></h2><p>The system employs a two-pronged approach for text extraction:</p><ol><li><strong>Primary Extractor (Apache Tika)</strong>: A powerful and versatile toolkit that can extract text from a wide variety of file formats. It is the recommended method for its superior performance and format support.</li><li><strong>Legacy Extractor</strong>: A fallback mechanism that uses a combination of libraries (<code>pdf2json</code>, <code>mammoth</code>, <code>xlsx</code>) for common file types like PDF, DOCX, and XLSX. This is used when Apache Tika is not configured.</li></ol><p>The main logic resides in <code>packages/backend/src/helpers/textExtractor.ts</code>, which decides which extraction method to use based on the application&#39;s configuration.</p><h2 id="configuration" tabindex="-1">Configuration <a class="header-anchor" href="#configuration" aria-label="Permalink to &quot;Configuration&quot;"></a></h2><p>To enable the primary text extraction method, you must configure the URL of an Apache Tika server instance in your environment variables.</p><p>In your <code>.env</code> file, set the <code>TIKA_URL</code>:</p><div class="language-env vp-adaptive-theme"><button title="Copy Code" class="copy"></button><span class="lang">env</span><pre class="shiki shiki-themes github-light github-dark vp-code" tabindex="0"><code><span class="line"><span># .env.example</span></span>
<span class="line"><span></span></span>
<span class="line"><span># Apache Tika Integration</span></span>
<span class="line"><span># ONLY active if TIKA_URL is set</span></span>
<span class="line"><span>TIKA_URL=http://tika:9998</span></span></code></pre></div><p>If <code>TIKA_URL</code> is not set, the system will automatically fall back to the legacy extraction methods. The service performs a health check on startup to verify connectivity with the Tika server.</p><h2 id="file-size-limits" tabindex="-1">File Size Limits <a class="header-anchor" href="#file-size-limits" aria-label="Permalink to &quot;File Size Limits&quot;"></a></h2><p>To prevent excessive memory usage and processing time, the service imposes a general size limit on files submitted for text extraction. Files larger than the configured limit will be skipped.</p><ul><li><strong>With Apache Tika</strong>: The maximum file size is <strong>100MB</strong>.</li><li><strong>With Legacy Fallback</strong>: The maximum file size is <strong>50MB</strong>.</li></ul><h2 id="supported-file-formats" tabindex="-1">Supported File Formats <a class="header-anchor" href="#supported-file-formats" aria-label="Permalink to &quot;Supported File Formats&quot;"></a></h2><p>The service&#39;s ability to extract text depends on whether it&#39;s using Apache Tika or the legacy fallback methods.</p><h3 id="with-apache-tika" tabindex="-1">With Apache Tika <a class="header-anchor" href="#with-apache-tika" aria-label="Permalink to &quot;With Apache Tika&quot;"></a></h3><p>When <code>TIKA_URL</code> is configured, the service can process a vast range of file formats. Apache Tika is designed for broad compatibility and supports hundreds of file types, including but not limited to:</p><ul><li>Portable Document Format (PDF)</li><li>Microsoft Office formats (DOC, DOCX, PPT, PPTX, XLS, XLSX)</li><li>OpenDocument Formats (ODT, ODS, ODP)</li><li>Rich Text Format (RTF)</li><li>Plain Text (TXT, CSV, JSON, XML, HTML)</li><li>Image formats with OCR capabilities (PNG, JPEG, TIFF)</li><li>Archive formats (ZIP, TAR, GZ)</li><li>Email formats (EML, MSG)</li></ul><p>For a complete and up-to-date list, please refer to the official <a href="https://tika.apache.org/3.2.3/formats.html" target="_blank" rel="noreferrer">Apache Tika documentation</a>.</p><h3 id="with-legacy-fallback" tabindex="-1">With Legacy Fallback <a class="header-anchor" href="#with-legacy-fallback" aria-label="Permalink to &quot;With Legacy Fallback&quot;"></a></h3><p>When Tika is not configured, text extraction is limited to the following formats:</p><ul><li><code>application/pdf</code> (PDF)</li><li><code>application/vnd.openxmlformats-officedocument.wordprocessingml.document</code> (DOCX)</li><li><code>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</code> (XLSX)</li><li>Plain text formats such as <code>text/*</code>, <code>application/json</code>, and <code>application/xml</code>.</li></ul><h2 id="features-of-the-tika-integration-ocrservice" tabindex="-1">Features of the Tika Integration (<code>OcrService</code>) <a class="header-anchor" href="#features-of-the-tika-integration-ocrservice" aria-label="Permalink to &quot;Features of the Tika Integration (`OcrService`)&quot;"></a></h2><p>The <code>OcrService</code> (<code>packages/backend/src/services/OcrService.ts</code>) provides several enhancements to make text extraction efficient and robust.</p><h3 id="caching" tabindex="-1">Caching <a class="header-anchor" href="#caching" aria-label="Permalink to &quot;Caching&quot;"></a></h3><p>To avoid redundant processing of the same file, the service implements a simple LRU (Least Recently Used) cache.</p><ul><li><strong>Cache Key</strong>: A SHA-256 hash of the file&#39;s buffer is used as the cache key.</li><li><strong>Functionality</strong>: If a file with the same hash is processed again, the text content is served directly from the cache, saving significant processing time.</li><li><strong>Statistics</strong>: The service keeps track of cache hits, misses, and the hit rate for performance monitoring.</li></ul><h3 id="concurrency-management-semaphore" tabindex="-1">Concurrency Management (Semaphore) <a class="header-anchor" href="#concurrency-management-semaphore" aria-label="Permalink to &quot;Concurrency Management (Semaphore)&quot;"></a></h3><p>Extracting text from large files can be resource-intensive. To prevent the Tika server from being overwhelmed by multiple requests for the <em>same file</em> simultaneously (e.g., during a large import), a semaphore mechanism is used.</p><ul><li><strong>Functionality</strong>: If a request for a specific file (identified by its hash) is already in progress, any subsequent requests for the same file will wait for the first one to complete and then use its result.</li><li><strong>Benefit</strong>: This deduplicates parallel processing efforts and reduces unnecessary load on the Tika server.</li></ul><h3 id="health-check-and-dns-fallback" tabindex="-1">Health Check and DNS Fallback <a class="header-anchor" href="#health-check-and-dns-fallback" aria-label="Permalink to &quot;Health Check and DNS Fallback&quot;"></a></h3><ul><li><strong>Availability Check</strong>: The service includes a <code>checkTikaAvailability</code> method to verify that the Tika server is reachable and operational. This check is performed on application startup.</li><li><strong>DNS Fallback</strong>: For convenience in Docker environments, if the Tika URL uses the hostname <code>tika</code> (e.g., <code>http://tika:9998</code>), the service will automatically attempt a fallback to <code>localhost</code> if the initial connection fails.</li></ul><h2 id="legacy-fallback-methods" tabindex="-1">Legacy Fallback Methods <a class="header-anchor" href="#legacy-fallback-methods" aria-label="Permalink to &quot;Legacy Fallback Methods&quot;"></a></h2><p>When Tika is not available, the <code>extractTextLegacy</code> function in <code>textExtractor.ts</code> handles extraction for a limited set of MIME types:</p><ul><li><code>application/pdf</code>: Processed using <code>pdf2json</code>. Includes a 50MB size limit and a 5-second timeout to prevent memory issues.</li><li><code>application/vnd.openxmlformats-officedocument.wordprocessingml.document</code> (DOCX): Processed using <code>mammoth</code>.</li><li><code>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</code> (XLSX): Processed using <code>xlsx</code>.</li><li>Plain text formats (<code>text/*</code>, <code>application/json</code>, <code>application/xml</code>): Converted directly from the buffer.</li></ul></div></div></main><footer class="VPDocFooter" data-v-29334b8c data-v-a014c1de><!--[--><!--]--><!----><nav class="prev-next" aria-labelledby="doc-footer-aria-label" data-v-a014c1de><span class="visually-hidden" id="doc-footer-aria-label" data-v-a014c1de>Pager</span><div class="pager" data-v-a014c1de><a class="VPLink link pager-link prev" href="/services/storage-service.html" data-v-a014c1de><!--[--><span class="desc" data-v-a014c1de>Previous page</span><span class="title" data-v-a014c1de>Storage Service</span><!--]--></a></div><div class="pager" data-v-a014c1de><a class="VPLink link pager-link next" href="/services/iam-service/iam-policy.html" data-v-a014c1de><!--[--><span class="desc" data-v-a014c1de>Next page</span><span class="title" data-v-a014c1de>IAM Policies</span><!--]--></a></div></nav></footer><!--[--><!--]--></div></div></div><!--[--><!--]--></div></div><!----><!--[--><!--]--></div></div>
<script>window.__VP_HASH_MAP__=JSON.parse("{\"api_api-keys.md\":\"Dn21HeSJ\",\"api_archived-email.md\":\"BT-mHBqK\",\"api_auth.md\":\"D-468dKx\",\"api_authentication.md\":\"3ZX-rlxc\",\"api_dashboard.md\":\"CSiI_2IK\",\"api_iam.md\":\"B5FbdpjA\",\"api_index.md\":\"B_r30QlK\",\"api_ingestion.md\":\"D362LBU_\",\"api_integrity.md\":\"B9VpMKD6\",\"api_jobs.md\":\"CTasZcNL\",\"api_rate-limiting.md\":\"BqhkXaUd\",\"api_search.md\":\"Cu20yO7t\",\"api_settings.md\":\"CwhPvkZf\",\"api_storage.md\":\"CTn9PQaD\",\"api_upload.md\":\"CfCXTOWM\",\"api_users.md\":\"BdQul9nH\",\"enterprise_audit-log_api.md\":\"jQPQ8cO8\",\"enterprise_audit-log_audit-service.md\":\"D9higq7j\",\"enterprise_audit-log_guide.md\":\"DuHTEaId\",\"enterprise_audit-log_index.md\":\"By894rqA\",\"enterprise_legal-holds_api.md\":\"E6pKaGoK\",\"enterprise_legal-holds_guide.md\":\"4JCnApXO\",\"enterprise_legal-holds_index.md\":\"CXivackd\",\"enterprise_retention-labels_api.md\":\"De0lRuGD\",\"enterprise_retention-labels_automated-tagging.md\":\"uMCViAe6\",\"enterprise_retention-labels_guide.md\":\"LMnSU6ly\",\"enterprise_retention-labels_index.md\":\"gaps1eMo\",\"enterprise_retention-policy_api.md\":\"Dv1_ca5K\",\"enterprise_retention-policy_guide.md\":\"BxhZekoC\",\"enterprise_retention-policy_index.md\":\"C_qinQpQ\",\"enterprise_retention-policy_lifecycle-worker.md\":\"CLFQ4ZEf\",\"enterprise_retention-policy_retention-service.md\":\"0HsJnlOP\",\"index.md\":\"ChFzL1lA\",\"services_iam-service_iam-policy.md\":\"DSOajx7I\",\"services_index.md\":\"BmWnT3gS\",\"services_job-queue.md\":\"_FwJJttJ\",\"services_ocr-service.md\":\"B6-nFSTJ\",\"services_storage-service.md\":\"Bpdj_lG7\",\"summary.md\":\"C_r3wuBH\",\"user-guides_email-providers_eml.md\":\"DV6Zs6RI\",\"user-guides_email-providers_google-workspace.md\":\"CaNRUOL2\",\"user-guides_email-providers_imap.md\":\"CQ8xQOnx\",\"user-guides_email-providers_index.md\":\"D2pQ89-G\",\"user-guides_email-providers_mbox.md\":\"B68Wejyx\",\"user-guides_email-providers_merging-sources.md\":\"D0DI--4D\",\"user-guides_email-providers_microsoft-365.md\":\"BNGY0WTw\",\"user-guides_email-providers_pst.md\":\"Dech_zDS\",\"user-guides_installation.md\":\"CYbox6PW\",\"user-guides_integrity-check.md\":\"BfM1nsWe\",\"user-guides_settings_system.md\":\"BI9F6Lpw\",\"user-guides_troubleshooting_cors-errors.md\":\"wu2DtNUw\",\"user-guides_upgrade-and-migration_meilisearch-upgrade.md\":\"BTQMSseX\",\"user-guides_upgrade-and-migration_upgrade.md\":\"DlOu8Pon\"}");window.__VP_SITE_DATA__=JSON.parse("{\"lang\":\"en-US\",\"dir\":\"ltr\",\"title\":\"Open Archiver Docs\",\"description\":\"Official documentation for the Open Archiver project.\",\"base\":\"/\",\"head\":[],\"router\":{\"prefetchLinks\":true},\"appearance\":true,\"themeConfig\":{\"search\":{\"provider\":\"local\"},\"logo\":{\"src\":\"/logo-sq.svg\"},\"nav\":[{\"text\":\"Home\",\"link\":\"/\"},{\"text\":\"Github\",\"link\":\"https://github.com/LogicLabs-OU/OpenArchiver\"},{\"text\":\"Website\",\"link\":\"https://openarchiver.com/\"},{\"text\":\"Discord\",\"link\":\"https://discord.gg/MTtD7BhuTQ\"}],\"sidebar\":[{\"text\":\"User Guides\",\"items\":[{\"text\":\"Get Started\",\"link\":\"/\"},{\"text\":\"Installation\",\"link\":\"/user-guides/installation\"},{\"text\":\"Email Integrity Check\",\"link\":\"/user-guides/integrity-check\"},{\"text\":\"Email Providers\",\"link\":\"/user-guides/email-providers/\",\"collapsed\":true,\"items\":[{\"text\":\"Generic IMAP Server\",\"link\":\"/user-guides/email-providers/imap\"},{\"text\":\"Google Workspace\",\"link\":\"/user-guides/email-providers/google-workspace\"},{\"text\":\"Microsoft 365\",\"link\":\"/user-guides/email-providers/microsoft-365\"},{\"text\":\"EML Import\",\"link\":\"/user-guides/email-providers/eml\"},{\"text\":\"PST Import\",\"link\":\"/user-guides/email-providers/pst\"},{\"text\":\"Mbox Import\",\"link\":\"/user-guides/email-providers/mbox\"}]},{\"text\":\"Settings\",\"collapsed\":true,\"items\":[{\"text\":\"System\",\"link\":\"/user-guides/settings/system\"}]},{\"text\":\"Upgrading and Migration\",\"collapsed\":true,\"items\":[{\"text\":\"Upgrading\",\"link\":\"/user-guides/upgrade-and-migration/upgrade\"},{\"text\":\"Meilisearch Upgrade\",\"link\":\"/user-guides/upgrade-and-migration/meilisearch-upgrade\"}]}]},{\"text\":\"API Reference\",\"items\":[{\"text\":\"Overview\",\"link\":\"/api/\"},{\"text\":\"Authentication\",\"link\":\"/api/authentication\"},{\"text\":\"Rate Limiting\",\"link\":\"/api/rate-limiting\"},{\"text\":\"Auth\",\"link\":\"/api/auth\"},{\"text\":\"Archived Email\",\"link\":\"/api/archived-email\"},{\"text\":\"Dashboard\",\"link\":\"/api/dashboard\"},{\"text\":\"Ingestion\",\"link\":\"/api/ingestion\"},{\"text\":\"Integrity Check\",\"link\":\"/api/integrity\"},{\"text\":\"Search\",\"link\":\"/api/search\"},{\"text\":\"Storage\",\"link\":\"/api/storage\"},{\"text\":\"Upload\",\"link\":\"/api/upload\"},{\"text\":\"Jobs\",\"link\":\"/api/jobs\"},{\"text\":\"Users\",\"link\":\"/api/users\"},{\"text\":\"IAM\",\"link\":\"/api/iam\"},{\"text\":\"API Keys\",\"link\":\"/api/api-keys\"},{\"text\":\"Settings\",\"link\":\"/api/settings\"}]},{\"text\":\"Services\",\"items\":[{\"text\":\"Overview\",\"link\":\"/services/\"},{\"text\":\"Storage Service\",\"link\":\"/services/storage-service\"},{\"text\":\"OCR Service\",\"link\":\"/services/ocr-service\"},{\"text\":\"IAM Service\",\"items\":[{\"text\":\"IAM Policies\",\"link\":\"/services/iam-service/iam-policy\"}]}]}]},\"locales\":{},\"scrollOffset\":134,\"cleanUrls\":false}");</script>
</body>
</html>