On Thu, 2009-12-17 17:06:36 +0100, Jan-Benedict Glaw <jbglaw at lug-owl.de> wrote:
On Mon, 2009-12-14 14:15:30 +0000, Philip Pemberton
<classiccmp at philpem.me.uk> wrote:
[...]
Also, does anyone know of an app that can take the PDF file, OCR it
and then insert the text as a background layer while leaving the
image alone? I'm pretty sure Acrobat can do this, but like most
Adobe software, the price tag is somewhat... eye-watering. "If you
have to ask how much it costs, you can't afford it."
The script may be used under the terms of the GNU General Public
License Version 3.
Hmpf, the script was stripped. Here it is, inline:
#!/usr/bin/env sh
#
# Turn the scanned *.tiff files in the current directory into one PDF that
# combines the cleaned-up page images with their OCR (hOCR) output.
#
# Stages, each writing into its own sub-directory of ${OUTDIR}:
#   split/               tiffsplit: one TIFF per page
#   singlepage_pbm/      convert: NetPBM images for unpaper
#   singlepage_unpaper/  unpaper: beautified pages
#   hocr/                ocroscript rec-tess: OCR output, one file per page
#   unpaper_tiffified/   convert + tiffset: TIFFs with resolution tags
#   single_pdf/          ${HOCR2PDF}: one PDF per page
#   book.pdf             pdftk: all pages concatenated
#
# NETPBM, TESSLANG and HOCR2PDF may be overridden from the environment.
#
# The script may be used under the terms of the GNU General Public
# License Version 3.

# Every stage consumes the previous stage's output, so stop at the first
# failing command (and on any unset variable) instead of carrying on with
# incomplete data.
set -eu

NETPBM="${NETPBM:-pbm}"
TESSLANG="${TESSLANG:-deu}"
HOCR2PDF="${HOCR2PDF:-/home/jbglaw/paper/hocr2/HocrConverter.py}"

# TIFF resolution tags written into the final page images:
# 2 = pixel/inch, 600x600 DPI
TIFF_296=2      # tag 296: resolution unit
TIFF_282=600    # tag 282: X resolution
TIFF_283=600    # tag 283: Y resolution

OUTDIR="__out__.$$"
# ${OUTDIR:?} refuses to expand an empty value, guarding the recursive delete.
rm -rf -- "${OUTDIR:?}"
mkdir "${OUTDIR}"

# Split
mkdir "${OUTDIR}/split"
# "./" keeps input names that start with "-" from being parsed as options.
for scan in ./*.tiff; do
  # An unmatched glob stays literal; treat that as "no input" and bail out.
  if [ ! -e "${scan}" ]; then
    echo "No *.tiff input files found in the current directory" >&2
    exit 1
  fi
  name="${scan##*/}"
  name="${name%.tiff}"
  tiffsplit "${scan}" "${OUTDIR}/split/${name}-"
done

# Convert to bw/grey/colour NetPBM format for unpaper
mkdir "${OUTDIR}/singlepage_pbm"
for page in "${OUTDIR}"/split/*.tif; do
  [ -e "${page}" ] || continue
  name="${page##*/}"
  name="${name%.tif}"
  convert "${page}" "${OUTDIR}/singlepage_pbm/${name}.${NETPBM}"
done

# Beautify scans
# Each scan is rotated by 90 degrees and handled as a double-page layout:
# one input page produces two output pages, numbered through the %d
# placeholder in the output file name.
mkdir "${OUTDIR}/singlepage_unpaper"
for page in "${OUTDIR}"/singlepage_pbm/*."${NETPBM}"; do
  [ -e "${page}" ] || continue
  name="${page##*/}"
  name="${name%.*}"
  unpaper --pre-rotate 90 --input-pages 1 --output-pages 2 --layout double \
    "${page}" "${OUTDIR}/singlepage_unpaper/${name}-%d.${NETPBM}"
done

# OCR all the stuff
mkdir "${OUTDIR}/hocr"
# The language is handed to ocroscript both as an option and through the
# environment; the export does not change per page, so do it once.
tesslanguage="${TESSLANG}"
export tesslanguage
for page in "${OUTDIR}"/singlepage_unpaper/*."${NETPBM}"; do
  [ -e "${page}" ] || continue
  name="${page##*/}"
  name="${name%.*}"
  ocroscript rec-tess --tesslanguage="${TESSLANG}" "${page}" \
    > "${OUTDIR}/hocr/${name}.hocr"
done

# Generate TIFFs from beautified NetPBM images
mkdir "${OUTDIR}/unpaper_tiffified"
for page in "${OUTDIR}"/singlepage_unpaper/*."${NETPBM}"; do
  [ -e "${page}" ] || continue
  name="${page##*/}"
  name="${name%.*}"
  tiff="${OUTDIR}/unpaper_tiffified/${name}.tiff"
  convert "${page}" "${tiff}"
  tiffset -s 296 "${TIFF_296}" "${tiff}"
  tiffset -s 282 "${TIFF_282}" "${tiff}"
  tiffset -s 283 "${TIFF_283}" "${tiff}"
done

# Pause so the TIFFs can be touched up by hand before the PDFs are built.
# "read -p" is not available in POSIX sh, so print the prompt separately.
printf '%s' "Fix images now and press enter" >&2
read -r reply || true

# Generate single page PDF files
mkdir "${OUTDIR}/single_pdf"
for tiff in "${OUTDIR}/unpaper_tiffified"/*.tiff; do
  [ -e "${tiff}" ] || continue
  name="${tiff##*/}"
  name="${name%.tiff}"
  "${HOCR2PDF}" "${OUTDIR}/hocr/${name}.hocr" "${tiff}" \
    "${OUTDIR}/single_pdf/${name}.pdf"
done

# Generate final PDF
pdftk "${OUTDIR}/single_pdf"/*.pdf cat output "${OUTDIR}/book.pdf"
MfG, JBG
--
Jan-Benedict Glaw jbglaw at lug-owl.de +49-172-7608481
Signature of: Träume nicht von Dein Leben: Lebe Deinen Traum!
the second :