The `extract_file` method returns a stream implementing `std::io::Read`, allowing memory-efficient processing of large documents.
```rust
use std::io::{BufReader, Read};
use extractous::Extractor;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let extractor = Extractor::new();
    let stream = extractor.extract_file("path/to/document.pdf")?;

    // Create a buffered reader
    let mut reader = BufReader::new(stream);
    let mut buffer = Vec::new();
    reader.read_to_end(&mut buffer)?;

    // Convert to string if needed
    let content = String::from_utf8(buffer)?;
    println!("{}", content);

    Ok(())
}
```
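Because the returned stream is just a `Read` implementation, it can also be piped straight into any writer without ever holding the full text in memory. Here is a minimal sketch using `std::io::copy`; the output file name is illustrative, not part of the Extractous API:

```rust
use std::fs::File;
use std::io::{self, BufReader};
use extractous::Extractor;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let extractor = Extractor::new();
    let stream = extractor.extract_file("path/to/document.pdf")?;

    // Stream the extracted text directly into a file; only a small
    // internal buffer is held in memory at any time.
    let mut reader = BufReader::new(stream);
    let mut output = File::create("extracted.txt")?; // illustrative output path
    io::copy(&mut reader, &mut output)?;

    Ok(())
}
```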
To keep memory usage bounded even for very large documents, process the content in fixed-size chunks instead of reading it all at once:
```rust
use std::io::{BufReader, Read};
use extractous::Extractor;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let extractor = Extractor::new();
    let stream = extractor.extract_file("path/to/document.pdf")?;
    let mut reader = BufReader::new(stream);

    let mut buffer = [0; 1024]; // 1KB chunks
    loop {
        match reader.read(&mut buffer)? {
            0 => break, // EOF
            n => {
                // Process chunk of size n
                let chunk = &buffer[..n];
                // Your processing logic here
            }
        }
    }

    Ok(())
}
```
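As one concrete fill-in for the processing step above, the following sketch accumulates simple statistics per chunk; the byte and newline counts are purely illustrative and not part of the Extractous API:

```rust
use std::io::{BufReader, Read};
use extractous::Extractor;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let extractor = Extractor::new();
    let stream = extractor.extract_file("path/to/document.pdf")?;
    let mut reader = BufReader::new(stream);

    let mut buffer = [0u8; 1024];
    let mut total_bytes = 0usize;
    let mut newlines = 0usize;

    loop {
        match reader.read(&mut buffer)? {
            0 => break, // EOF
            n => {
                // Per-chunk processing: count bytes and newline characters.
                total_bytes += n;
                newlines += buffer[..n].iter().filter(|&&b| b == b'\n').count();
            }
        }
    }

    println!("Extracted {} bytes across {} lines", total_bytes, newlines);
    Ok(())
}
```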
Stream extraction supports the same configuration options as `extract_file_to_string`:
```rust
use extractous::Extractor;
use extractous::PdfParserConfig;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let extractor = Extractor::new()
        .set_pdf_config(
            PdfParserConfig::new()
                .set_extract_annotation_text(false)
        );

    let stream = extractor.extract_file("path/to/document.pdf")?;
    // Process stream...

    Ok(())
}
```
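Since `BufReader` also implements `BufRead`, the configured stream can be consumed line by line as well. A sketch assuming the same extractor setup as above:

```rust
use std::io::{BufRead, BufReader};
use extractous::{Extractor, PdfParserConfig};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let extractor = Extractor::new()
        .set_pdf_config(PdfParserConfig::new().set_extract_annotation_text(false));

    let stream = extractor.extract_file("path/to/document.pdf")?;

    // Iterate over the extracted text one line at a time,
    // skipping blank lines as an example filter.
    let reader = BufReader::new(stream);
    for line in reader.lines() {
        let line = line?;
        if !line.trim().is_empty() {
            println!("{}", line);
        }
    }

    Ok(())
}
```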