This page lists all available configuration options for the extraction API. For a quick start guide with minimal configuration, see the Quickstart.
- Type:
string
- Required:
true
- Default:
"FAST_WITH_OCR"
- Options:
"FAST", "FAST_WITH_OCR", "ACCURATE", "ACCURATE_WITH_OCR"
Controls the extraction strategy and accuracy level.
- FAST: Quick extraction without OCR
- FAST_WITH_OCR: Quick extraction with OCR for images and scanned documents
- ACCURATE: Detailed extraction without OCR
- ACCURATE_WITH_OCR: Detailed extraction with OCR (highest accuracy, slower)
Configuration options specific to PDF document processing.
- Type:
string
- Default:
"AUTO"
- Options:
"AUTO", "FORCE", "DISABLE"
- Type:
boolean
- Default:
true
Extract text from PDF annotations.
- Type:
boolean
- Default:
false
Extract inline images from PDF documents.
- Type:
boolean
- Default:
false
Extract marked content from PDF documents.
- Type:
boolean
- Default:
false
Only extract unique inline images, avoiding duplicates.
Configuration options for OCR (Optical Character Recognition) processing.
- Type:
boolean
- Default:
false
Automatically rotate images for optimal OCR processing.
Image resolution in DPI for OCR processing.
Color depth for image processing.
- Type:
boolean
- Default:
false
Enable image preprocessing for improved OCR results.
- Type:
string
- Default:
"eng"
Language used for OCR processing.
Maximum time in seconds for OCR processing.
Configuration options for processing Microsoft Office documents.
- Type:
boolean
- Default:
true
Combine phonetic runs in text extraction.
- Type:
boolean
- Default:
false
Extract all alternative content from MSG files.
- Type:
boolean
- Default:
false
Extract macros from Office documents.
- Type:
boolean
- Default:
false
Include deleted/revised content in extraction.
- Type:
boolean
- Default:
true
Include headers and footers in extracted content.
- Type:
boolean
- Default:
false
Include empty/missing rows in table extraction.
- Type:
boolean
- Default:
false
Include moved content in extraction results.
- Type:
boolean
- Default:
true
Include content from shapes and text boxes.
- Type:
boolean
- Default:
true
Include slide master content in PowerPoint extractions.
- Type:
boolean
- Default:
true
Include slide notes in PowerPoint extractions.
These are example requests using all available configuration options. You can omit any options you don't need.
curl
python
javascript
rust
# Upload a document to the extraction endpoint with every configuration option set explicitly.
# The leading '@' tells curl to send the contents of the file at that path; without it,
# curl submits the literal path string as an ordinary text field.
curl --request POST \
  --url https://api.extractous.com/v1/extract \
  --header 'Content-Type: multipart/form-data' \
  --header 'X-Api-Key: YOUR_API_KEY' \
  --form 'file=@PATH_TO_YOUR_FILE' \
  --form 'config[strategy]=FAST_WITH_OCR' \
  --form 'config[pdf_config][ocr_strategy]=AUTO' \
  --form 'config[pdf_config][extract_annotation_text]=true' \
  --form 'config[pdf_config][extract_inline_images]=false' \
  --form 'config[pdf_config][extract_marked_content]=false' \
  --form 'config[pdf_config][extract_unique_inline_images_only]=false' \
  --form 'config[ocr_config][apply_rotation]=false' \
  --form 'config[ocr_config][density]=300' \
  --form 'config[ocr_config][depth]=8' \
  --form 'config[ocr_config][enable_image_preprocessing]=false' \
  --form 'config[ocr_config][language]=eng' \
  --form 'config[ocr_config][timeout_seconds]=120' \
  --form 'config[office_config][concatenate_phonetic_runs]=true' \
  --form 'config[office_config][extract_all_alternatives_from_msg]=false' \
  --form 'config[office_config][extract_macros]=false' \
  --form 'config[office_config][include_deleted_content]=false' \
  --form 'config[office_config][include_headers_and_footers]=true' \
  --form 'config[office_config][include_missing_rows]=false' \
  --form 'config[office_config][include_move_from_content]=false' \
  --form 'config[office_config][include_shape_based_content]=true' \
  --form 'config[office_config][include_slide_master_content]=true' \
  --form 'config[office_config][include_slide_notes]=true'
import requests
import json
def extract_document(file_path, api_key):
    """Upload a document to the extraction API and return the decoded JSON response.

    Args:
        file_path: Path of the file to upload.
        api_key: Value sent in the ``X-Api-Key`` request header.

    Returns:
        The response body parsed from JSON.
    """
    url = "https://api.extractous.com/v1/extract"
    config = {
        "strategy": "FAST_WITH_OCR",
        "pdf_config": {
            "ocr_strategy": "AUTO",
            "extract_annotation_text": True,
            "extract_inline_images": False,
            "extract_marked_content": False,
            "extract_unique_inline_images_only": False
        },
        "ocr_config": {
            "apply_rotation": False,
            "density": 300,
            "depth": 8,
            "enable_image_preprocessing": False,
            "language": "eng",
            "timeout_seconds": 120
        },
        "office_config": {
            "concatenate_phonetic_runs": True,
            "extract_all_alternatives_from_msg": False,
            "extract_macros": False,
            "include_deleted_content": False,
            "include_headers_and_footers": True,
            "include_missing_rows": False,
            "include_move_from_content": False,
            "include_shape_based_content": True,
            "include_slide_master_content": True,
            "include_slide_notes": True
        }
    }
    headers = {
        'X-Api-Key': api_key
    }
    # Open the file in a context manager so the handle is closed once the
    # upload finishes, even if the request raises.
    with open(file_path, 'rb') as document:
        files = {
            'file': document,
            # A (None, value) tuple sends the config as a plain form field
            # without a filename.
            'config': (None, json.dumps(config))
        }
        response = requests.post(url, headers=headers, files=files)
    return response.json()
const FormData = require('form-data');
const fs = require('fs');
const axios = require('axios');
/**
 * Uploads a document to the extraction API with every configuration option
 * set explicitly and resolves with the response payload.
 *
 * @param {string} filePath - Path of the file to upload.
 * @param {string} apiKey - Value sent in the X-Api-Key header.
 * @returns {Promise<*>} The `data` field of the axios response.
 */
async function extractDocument(filePath, apiKey) {
  const endpoint = 'https://api.extractous.com/v1/extract';

  // Per-format option groups, assembled into the full config below.
  const pdfOptions = {
    ocr_strategy: 'AUTO',
    extract_annotation_text: true,
    extract_inline_images: false,
    extract_marked_content: false,
    extract_unique_inline_images_only: false,
  };
  const ocrOptions = {
    apply_rotation: false,
    density: 300,
    depth: 8,
    enable_image_preprocessing: false,
    language: 'eng',
    timeout_seconds: 120,
  };
  const officeOptions = {
    concatenate_phonetic_runs: true,
    extract_all_alternatives_from_msg: false,
    extract_macros: false,
    include_deleted_content: false,
    include_headers_and_footers: true,
    include_missing_rows: false,
    include_move_from_content: false,
    include_shape_based_content: true,
    include_slide_master_content: true,
    include_slide_notes: true,
  };
  const config = {
    strategy: 'FAST_WITH_OCR',
    pdf_config: pdfOptions,
    ocr_config: ocrOptions,
    office_config: officeOptions,
  };

  // Multipart body: the file stream plus the config serialized as JSON.
  const form = new FormData();
  form.append('file', fs.createReadStream(filePath));
  form.append('config', JSON.stringify(config));

  // The form's own headers carry the multipart boundary; add the API key on top.
  const headers = { ...form.getHeaders(), 'X-Api-Key': apiKey };
  const { data } = await axios.post(endpoint, form, { headers });
  return data;
}
use reqwest::multipart::{Form, Part};
use serde_json::json;
use tokio::fs::File;
use tokio_util::codec::{BytesCodec, FramedRead};
/// Uploads the file at `file_path` to the extraction endpoint together with a
/// JSON configuration and returns the response body as text.
///
/// The file is streamed as the `file` multipart part and the configuration is
/// sent as the `config` text part. `api_key` is sent in the `X-Api-Key` header.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, or if sending the request or
/// reading the response body fails.
async fn extract_document(file_path: &str, api_key: &str) -> Result<String, Box<dyn std::error::Error>> {
    let url = "https://api.extractous.com/v1/extract";

    // Create config JSON
    let config = json!({
        "strategy": "FAST_WITH_OCR",
        "pdf_config": {
            "ocr_strategy": "AUTO",
            "extract_annotation_text": true,
            "extract_inline_images": false,
            "extract_marked_content": false,
            "extract_unique_inline_images_only": false
        },
        "office_config": {
            "concatenate_phonetic_runs": true,
            "extract_all_alternatives_from_msg": false,
            "extract_macros": false,
            "include_deleted_content": false,
            "include_headers_and_footers": true,
            "include_missing_rows": false,
            "include_move_from_content": false,
            "include_shape_based_content": true,
            "include_slide_master_content": true,
            "include_slide_notes": true
        },
        "ocr_config": {
            "apply_rotation": false,
            "density": 300,
            "depth": 8,
            "enable_image_preprocessing": false,
            "language": "eng",
            "timeout_seconds": 120
        }
    });

    // Derive the filename to advertise in the multipart part from the path's
    // final component, falling back to the path itself when there is none.
    let upload_name = std::path::Path::new(file_path)
        .file_name()
        .map(|name| name.to_string_lossy().into_owned())
        .unwrap_or_else(|| file_path.to_owned());

    // Open file stream
    let file = File::open(file_path).await?;
    let stream = FramedRead::new(file, BytesCodec::new());
    // Attach a filename to the part: a streamed part carries none by default.
    // NOTE(review): servers commonly treat a part as a file upload only when a
    // filename is present — confirm against the API's behavior.
    let file_part = Part::stream(reqwest::Body::wrap_stream(stream)).file_name(upload_name);

    // Create multipart form
    let form = Form::new()
        .part("file", file_part)
        .text("config", config.to_string());

    // Make request
    let client = reqwest::Client::new();
    let response = client
        .post(url)
        .header("X-Api-Key", api_key)
        .multipart(form)
        .send()
        .await?;

    Ok(response.text().await?)
}
// Usage example: upload a single file and print the raw response body.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Any error from the upload is returned from `main` unchanged.
    extract_document("path/to/file", "YOUR_API_KEY")
        .await
        .map(|body| println!("{}", body))
}