Improve word summary when doing OCR.
This commit is contained in:
parent
cb2c50f973
commit
fd8bd1da18
55
scan
55
scan
|
@ -115,7 +115,7 @@ function usage() {
|
||||||
|
|
||||||
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
|
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
|
||||||
OCRPROG="$OCRPROG_DEF"
|
OCRPROG="$OCRPROG_DEF"
|
||||||
P2TPROG="pdftotext"
|
P2TPROG="/opt/homebrew/bin/pdftotext"
|
||||||
DUPLEXOPTS=""
|
DUPLEXOPTS=""
|
||||||
MODE="scan"
|
MODE="scan"
|
||||||
DIR=${PDFDIR:-"~/Documents"}
|
DIR=${PDFDIR:-"~/Documents"}
|
||||||
|
@ -131,7 +131,7 @@ TEST=0
|
||||||
FINYEARTAGS=""
|
FINYEARTAGS=""
|
||||||
VERBOSE=0
|
VERBOSE=0
|
||||||
|
|
||||||
ALLARGS="$@"
|
ALLARGS=""
|
||||||
if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
|
if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
|
||||||
notify "processing $RCFILE"
|
notify "processing $RCFILE"
|
||||||
|
|
||||||
|
@ -149,8 +149,10 @@ if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
|
||||||
ok
|
ok
|
||||||
fi
|
fi
|
||||||
|
|
||||||
validargs="cdf:hlmMops:Ttv"
|
function processarg() {
|
||||||
while getopts "$validargs" i $ALLARGS; do
|
local i=$1
|
||||||
|
local arg=$2
|
||||||
|
|
||||||
case "$i" in
|
case "$i" in
|
||||||
v)
|
v)
|
||||||
VERBOSE=1;
|
VERBOSE=1;
|
||||||
|
@ -182,16 +184,17 @@ while getopts "$validargs" i $ALLARGS; do
|
||||||
;;
|
;;
|
||||||
o)
|
o)
|
||||||
OCR=1;
|
OCR=1;
|
||||||
|
info "OCR mode enabled"
|
||||||
;;
|
;;
|
||||||
O)
|
O)
|
||||||
OCRPROG="$OPTARG"
|
OCRPROG="$arg"
|
||||||
;;
|
;;
|
||||||
p)
|
p)
|
||||||
PREVIEW=1;
|
PREVIEW=1;
|
||||||
info "preview enabled"
|
info "preview enabled"
|
||||||
;;
|
;;
|
||||||
s)
|
s)
|
||||||
SCANNEROPTS="-scanner \"$OPTARG\"";
|
SCANNEROPTS="-scanner \"$arg\"";
|
||||||
;;
|
;;
|
||||||
T)
|
T)
|
||||||
info "temp mode"
|
info "temp mode"
|
||||||
|
@ -201,21 +204,32 @@ while getopts "$validargs" i $ALLARGS; do
|
||||||
info "test mode"
|
info "test mode"
|
||||||
TEST=1;
|
TEST=1;
|
||||||
;;
|
;;
|
||||||
-f)
|
f)
|
||||||
info "tag ^b$OPTARG^p will use financial year paths"
|
info "tag ^b$arg^p will use financial year paths"
|
||||||
FINYEARTAGS="$FINYEARTAGS $OPTARG"
|
FINYEARTAGS="$FINYEARTAGS $arg"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
error "invalid argument: $i";
|
error "invalid argument: $i";
|
||||||
usage;
|
usage;
|
||||||
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
validargs="cdf:hlmMops:Ttv"
|
||||||
|
while getopts "$validargs" i $ALLARGS; do
|
||||||
|
processarg "$i" "$OPTARG"
|
||||||
|
done
|
||||||
|
unset OPTIND
|
||||||
|
|
||||||
|
while getopts "$validargs" i $*; do
|
||||||
|
processarg "$i" "$OPTARG"
|
||||||
done
|
done
|
||||||
shift $((OPTIND - 1))
|
shift $((OPTIND - 1))
|
||||||
|
|
||||||
if [[ $OCR -eq 1 ]]; then
|
if [[ $OCR -eq 1 ]]; then
|
||||||
if [[ -x $OCRPROG ]]; then
|
if [[ -x $OCRPROG ]]; then
|
||||||
info "OCR mode enabled"
|
|
||||||
OCR=1;
|
OCR=1;
|
||||||
else
|
else
|
||||||
wtext="OCR requested but ^b$OCRPROG^p not found."
|
wtext="OCR requested but ^b$OCRPROG^p not found."
|
||||||
|
@ -329,7 +343,7 @@ while [[ $finished -eq 0 ]]; do
|
||||||
if [[ $VERBOSE -eq 1 ]]; then
|
if [[ $VERBOSE -eq 1 ]]; then
|
||||||
info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
|
info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
|
||||||
fi
|
fi
|
||||||
notify "Scanning..."
|
notify "Scanning"
|
||||||
OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
|
OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
|
||||||
rv=$?
|
rv=$?
|
||||||
|
|
||||||
|
@ -398,7 +412,7 @@ fi
|
||||||
|
|
||||||
PREVIEWFILE=""
|
PREVIEWFILE=""
|
||||||
if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
|
if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
|
||||||
itext="Scanned $NUMPAGES page(s) to "
|
itext="Scanned ^b$NUMPAGES^p page(s) to "
|
||||||
count=1
|
count=1
|
||||||
for x in $PDFFILE; do
|
for x in $PDFFILE; do
|
||||||
if [[ $count -eq 1 ]]; then
|
if [[ $count -eq 1 ]]; then
|
||||||
|
@ -416,19 +430,28 @@ else
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $OCR -eq 1 ]]; then
|
if [[ $OCR -eq 1 ]]; then
|
||||||
|
notify "Running OCR"
|
||||||
OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'"
|
OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'"
|
||||||
|
|
||||||
$OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
|
$OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
|
||||||
if [[ $? -eq 0 ]]; then
|
if [[ $? -eq 0 ]]; then
|
||||||
|
ok
|
||||||
if [[ -x $P2TPROG ]]; then
|
if [[ -x $P2TPROG ]]; then
|
||||||
words=$($P2TPROG "${PREVIEWFILE}" /dev/stdout | wc -c)
|
allwords=$($P2TPROG "${PREVIEWFILE}" /dev/stdout)
|
||||||
|
words=$(echo "$allwords" | wc -w | bc)
|
||||||
|
mostfreq=$(echo "$allwords" | tr ' ' '\n' 2>/dev/null | tr -dc 'A-Za-z\n' 2>/dev/null | awk NF | egrep '[a-z]' | sort | uniq -c | sort -n | tail -1)
|
||||||
|
mostfreq_word=$(echo "$mostfreq" | awk '{ print $2 }')
|
||||||
|
mostfreq_count=$(echo "$mostfreq" | awk '{ print $1 }')
|
||||||
[[ $words -eq 1 ]] && ess="" || ess="s"
|
[[ $words -eq 1 ]] && ess="" || ess="s"
|
||||||
inform "OCR complete. Found ^b${word}^p word${ess}"
|
[[ $mostfreq_count -eq 1 ]] && occ_ess="" || occ_ess="s"
|
||||||
|
|
||||||
|
inform "Found ^b${words}^p word${ess}, most frequent was ^b${mostfreq_word}^p with ^b${mostfreq_count}^p occurence${occ_ess}."
|
||||||
|
echo "$allwords" >/tmp/a
|
||||||
else
|
else
|
||||||
inform "OCR complete. Install pdftotext for word counts."
|
warn "pdftotext not installed - skipping word count"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
error "OCR failed"
|
fail
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue