% Conference paper: automated OCR ground-truth generation (DAS 2008).
% NOTE(review): `note` still reads "Accepted for publication" and no pages/publisher
% are recorded -- confirm the final publication details and update when known.
% NOTE(review): `booktitle` uses the venue acronym only; expand to the full
% proceedings title once verified.
@inproceedings{2008-IUPR-25Jun_0906,
  author    = {van Beusekom, Joost and Shafait, Faisal and Breuel, Thomas M.},
  title     = {Automated {OCR} Ground Truth Generation},
  booktitle = {Proceedings of {DAS} 2008},
  year      = {2008},
  month     = sep,
  note      = {Accepted for publication},
  pdf       = {2008-IUPR-25Jun_0906.pdf},
  abstract  = {Most optical character recognition (OCR) systems need to be trained and tested on the symbols that are to be recognized.
Therefore, ground truth data is needed. This data consists of character images together with their ASCII code. Among the
approaches for generating ground truth of real world data, one promising technique is to use electronic version of the
scanned documents. Using an alignment method, the character bounding boxes extracted from the electronic document are
matched to the scanned image. Current alignment methods are not robust to different similarity transforms. They also
need calibration to deal with non-linear local distortions introduced by the printing/scanning process. In this paper we
present a significant improvement over existing methods, allowing to skip the calibration step and having a more
accurate alignment, under all similarity transforms. Our method finds a robust and pixel accurate scanner independent
alignment of the scanned image with the electronic document, allowing the extraction of accurate ground truth character
information. The accuracy of the alignment is demonstrated using documents from the UW3 dataset. The results show that
the mean distance between the estimated and the ground truth character bounding box position is less than one pixel.},
}
