% Conference paper (JCDL 2024) on layout detection + OCR for historical documents.
% Names are stored in "Last, First" form; the acronym in the title is brace-protected
% so sentence-casing styles keep it upper-case.
% NOTE(review): the DOI prefix 10.1145 is normally registered to ACM, yet publisher
% is given as IEEE -- confirm against the proceedings front matter before relying on it.
% NOTE(review): address holds a country rather than the publisher's city -- confirm.
@inproceedings{f5dbf325d58941a688f07b66270cd441,
title = "Text Extraction for Complex Historical Documents: A Modular Approach to Layout Detection and {OCR}",
abstract = "We present a modular approach for high-precision extraction of data from retro-digitized historical texts with complex layouts. Our two-stage process combines AI-driven layout recognition using YOLOv9 with a fine-tuned Kraken OCR engine. By leveraging synthetic training data and custom fonts, we achieve low single-digit Character Error Rates (CER) for 19th-century documents like the Schematismus. Our approach is particularly effective for processing large-scale historical collections with intricate layouts and nested structures, demonstrating significant improvements over existing solutions in both accuracy and processing efficiency. The system's modular design allows for easy adaptation to different historical document types while maintaining high performance levels.",
keywords = "Historical documents, Information extraction, layout detection, OCR, Synthetic training data, YOLOv9",
author = "Fleischhacker, David and G{\"o}derle, {Wolfgang Thomas} and Kern, Roman",
year = "2025",
month = mar,
day = "13",
doi = "10.1145/3677389.3702524",
language = "English",
series = "Proceedings of the ACM/IEEE Joint Conference on Digital Libraries",
publisher = "IEEE",
editor = "Wu, Jian and Hu, Xiao and Nurmikko-Fuller, Terhi and Chu, Sam and Yang, Ruixian and Downie, {J. Stephen}",
booktitle = "JCDL 2024 - Proceedings of the 24th ACM/IEEE Joint Conference on Digital Libraries",
address = "United States",
note = "24th ACM/IEEE Joint Conference on Digital Libraries, JCDL 2024, JCDL '24 ; Conference date: 16-12-2024 Through 20-12-2024",
}