Extractous supports Optical Character Recognition (OCR) through Tesseract integration, allowing text extraction from scanned documents and images.
Install Tesseract and the required language packs (the commands below are for Debian/Ubuntu):
# Install Tesseract
sudo apt install tesseract-ocr
# Install language packs (example: German and Arabic)
sudo apt install tesseract-ocr-deu tesseract-ocr-ara
use extractous::Extractor;
use extractous::TesseractOcrConfig;
use extractous::PdfParserConfig;
use extractous::PdfOcrStrategy;
/// Runs OCR-only text extraction on a scanned PDF.
///
/// Tesseract is configured with the `"deu"` language pack, and the PDF parser
/// is told to use `PdfOcrStrategy::OCR_ONLY`.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_to_string`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build each configuration as a named value before attaching it.
    let ocr_config = TesseractOcrConfig::new().set_language("deu");
    let pdf_config = PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY);

    let extractor = Extractor::new()
        .set_ocr_config(ocr_config)
        .set_pdf_config(pdf_config);

    // The extraction result is bound here; this example does not use it further.
    let content = extractor.extract_file_to_string("path/to/scanned.pdf")?;
    Ok(())
}
use std::io::{BufReader, Read};
use extractous::Extractor;
use extractous::TesseractOcrConfig;
use extractous::PdfParserConfig;
use extractous::PdfOcrStrategy;
/// Runs OCR-only extraction on a scanned PDF and reads the result as a stream.
///
/// Uses the same configuration as the string-based variant (`"deu"` language,
/// `PdfOcrStrategy::OCR_ONLY`), but obtains the output from `extract_file`
/// and drains it through a `BufReader` into a byte vector.
///
/// # Errors
///
/// Propagates any error returned by `extract_file` or by reading the stream.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build each configuration as a named value before attaching it.
    let ocr_config = TesseractOcrConfig::new().set_language("deu");
    let pdf_config = PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY);

    let extractor = Extractor::new()
        .set_ocr_config(ocr_config)
        .set_pdf_config(pdf_config);

    // Wrap the extraction stream in a buffered reader and read it to the end.
    let mut buffered = BufReader::new(extractor.extract_file("path/to/scanned.pdf")?);
    let mut bytes = Vec::new();
    buffered.read_to_end(&mut bytes)?;

    Ok(())
}
The `PdfOcrStrategy` enum provides three options:
// OCR_FALLBACK: try embedded-text extraction first and fall back to OCR if no text is found.
// NOTE(review): confirm this variant name against the PdfOcrStrategy enum of the extractous version in use.
.set_ocr_strategy(PdfOcrStrategy::OCR_FALLBACK)
// OCR_ONLY: use OCR exclusively and ignore any embedded text.
.set_ocr_strategy(PdfOcrStrategy::OCR_ONLY)
// NO_OCR: disable OCR completely.
.set_ocr_strategy(PdfOcrStrategy::NO_OCR)
Configure OCR for multiple languages:
use extractous::Extractor;
use extractous::TesseractOcrConfig;
/// Extracts text from a document with OCR configured for several languages.
///
/// The language string `"eng+deu+ara"` passes multiple Tesseract language
/// codes at once, joined with `+`.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_to_string`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Multiple languages
    let ocr_config = TesseractOcrConfig::new().set_language("eng+deu+ara");
    let extractor = Extractor::new().set_ocr_config(ocr_config);

    // The extraction result is bound here; this example does not use it further.
    let content = extractor.extract_file_to_string("path/to/document.pdf")?;
    Ok(())
}