-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhocr.sh
executable file
·56 lines (43 loc) · 1.32 KB
/
hocr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/bin/bash
# hocr.sh
# argument1 = pdf
#
# Creates hOCR versions of a PDF file. Every page of the PDF is first rendered
# to a TIFF (the OCR input) and to a PNG inside
#   <script dir>/processed/hocr/<pdf name without extension>/
# then tesseract is run on every TIFF, and finally render.js is run through
# casperjs on every .html file found in that directory.
#
# File names containing spaces are supported: every expansion is quoted and
# the page files are enumerated with globs instead of parsing `ls` output.
#
# Exit status: E_BADARGS when the argument is missing or unusable, 1 when the
# output directory cannot be prepared or no page image was produced, otherwise
# the status of the last command that ran.

# Catch misspelled variable names. `set -e` is deliberately NOT enabled: a
# single page that fails OCR or rendering must not abort the remaining pages.
set -u
# An unmatched glob expands to nothing instead of the literal pattern.
shopt -s nullglob

# This name was used but never defined, which turned `exit $E_BADARGS` into a
# bare `exit` that reported success. 64 is EX_USAGE in sysexits.h.
readonly E_BADARGS=64

# Print "ERROR: <message>" on stderr and exit.
#   $1 - message
#   $2 - exit status (optional, defaults to 1)
die() {
  printf 'ERROR: %s\n' "$1" >&2
  exit "${2:-1}"
}

if [[ -z "${1:-}" ]]; then
  printf 'ERROR: You must supply a file to convert\n' >&2
  printf 'USAGE: %s file.pdf\n' "${0##*/}" >&2
  exit "$E_BADARGS"
fi
input=$1

# absolute directory of script
DIR="$(cd -- "$(dirname -- "$0")" && pwd)" || die "cannot resolve the script directory"

# Name of the PDF without its directory and without its last extension.
filename=$(basename -- "$input")
filename=${filename%.*}

# An empty, dot-only or slash-containing name would make HOCR_DIR resolve to
# the shared processed/hocr directory (or above it), which is wiped below.
case "$filename" in
  '' | . | .. | */*)
    die "cannot derive an output name from '${input}'" "$E_BADARGS"
    ;;
esac

HOCR_DIR="${DIR}/processed/hocr/${filename}"

# Start from an empty per-PDF directory so output of an earlier run for the
# same name never mixes with this one. -p also creates processed/hocr itself
# on a fresh checkout.
rm -rf -- "${HOCR_DIR:?}"
mkdir -p -- "$HOCR_DIR" || die "cannot create ${HOCR_DIR}"

# One uncompressed, monochrome 300 dpi TIFF per page (fed to tesseract) ...
convert -density 300 -compress None -monochrome -median 1 "$input" "${HOCR_DIR}/${filename}%03d.tiff"
# ... and one 300 dpi PNG per page.
convert -density 300 "$input" "${HOCR_DIR}/${filename}%03d.png"

# tesseract is run from inside the output directory so that it is handed the
# bare image file names and writes its result next to the images.
pushd "$HOCR_DIR" || die "cannot enter ${HOCR_DIR}"
tiffs=(*.tiff)
if (( ${#tiffs[@]} == 0 )); then
  die "no page images were produced from '${input}'"
fi
for converted in "${tiffs[@]}"; do
  # Output base = image name without ".tiff"; tesseract appends its own suffix.
  # tesseract $converted ./hocr/${newfilename} -l eng ../hocr.config
  tesseract "$converted" "${converted%.*}" -l eng "${DIR}/hocr.config"
done
popd || die "cannot return to the starting directory"

# NOTE(review): presumably hocr.config makes tesseract write its hOCR output
# as <page>.html -- confirm; nothing is rendered if it uses another suffix.
# NOTE(review): render.js is looked up relative to the caller's working
# directory, not ${DIR} (unlike hocr.config) -- confirm that is intended.
for hocr in "${HOCR_DIR}"/*.html; do
  printf '%s\n' "$hocr"
  casperjs render.js "$hocr"
done