Improve word summary when doing OCR.

This commit is contained in:
Rob Pearce 2022-07-23 12:05:21 +10:00
parent cb2c50f973
commit fd8bd1da18
1 changed file with 39 additions and 16 deletions

55
scan
View File

@ -115,7 +115,7 @@ function usage() {
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf" OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
OCRPROG="$OCRPROG_DEF" OCRPROG="$OCRPROG_DEF"
P2TPROG="pdftotext" P2TPROG="/opt/homebrew/bin/pdftotext"
DUPLEXOPTS="" DUPLEXOPTS=""
MODE="scan" MODE="scan"
DIR=${PDFDIR:-"~/Documents"} DIR=${PDFDIR:-"~/Documents"}
@ -131,7 +131,7 @@ TEST=0
FINYEARTAGS="" FINYEARTAGS=""
VERBOSE=0 VERBOSE=0
ALLARGS="$@" ALLARGS=""
if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
notify "processing $RCFILE" notify "processing $RCFILE"
@ -149,8 +149,10 @@ if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
ok ok
fi fi
validargs="cdf:hlmMops:Ttv" function processarg() {
while getopts "$validargs" i $ALLARGS; do local i=$1
local arg=$2
case "$i" in case "$i" in
v) v)
VERBOSE=1; VERBOSE=1;
@ -182,16 +184,17 @@ while getopts "$validargs" i $ALLARGS; do
;; ;;
o) o)
OCR=1; OCR=1;
info "OCR mode enabled"
;; ;;
O) O)
OCRPROG="$OPTARG" OCRPROG="$arg"
;; ;;
p) p)
PREVIEW=1; PREVIEW=1;
info "preview enabled" info "preview enabled"
;; ;;
s) s)
SCANNEROPTS="-scanner \"$OPTARG\""; SCANNEROPTS="-scanner \"$arg\"";
;; ;;
T) T)
info "temp mode" info "temp mode"
@ -201,21 +204,32 @@ while getopts "$validargs" i $ALLARGS; do
info "test mode" info "test mode"
TEST=1; TEST=1;
;; ;;
-f) f)
info "tag ^b$OPTARG^p will use financial year paths" info "tag ^b$arg^p will use financial year paths"
FINYEARTAGS="$FINYEARTAGS $OPTARG" FINYEARTAGS="$FINYEARTAGS $arg"
;; ;;
*) *)
error "invalid argument: $i"; error "invalid argument: $i";
usage; usage;
exit 1
;; ;;
esac esac
}
validargs="cdf:hlmMops:Ttv"
while getopts "$validargs" i $ALLARGS; do
processarg "$i" "$OPTARG"
done
unset OPTIND
while getopts "$validargs" i $*; do
processarg "$i" "$OPTARG"
done done
shift $((OPTIND - 1)) shift $((OPTIND - 1))
if [[ $OCR -eq 1 ]]; then if [[ $OCR -eq 1 ]]; then
if [[ -x $OCRPROG ]]; then if [[ -x $OCRPROG ]]; then
info "OCR mode enabled"
OCR=1; OCR=1;
else else
wtext="OCR requested but ^b$OCRPROG^p not found." wtext="OCR requested but ^b$OCRPROG^p not found."
@ -329,7 +343,7 @@ while [[ $finished -eq 0 ]]; do
if [[ $VERBOSE -eq 1 ]]; then if [[ $VERBOSE -eq 1 ]]; then
info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1" info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
fi fi
notify "Scanning..." notify "Scanning"
OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1) OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
rv=$? rv=$?
@ -398,7 +412,7 @@ fi
PREVIEWFILE="" PREVIEWFILE=""
if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
itext="Scanned $NUMPAGES page(s) to " itext="Scanned ^b$NUMPAGES^p page(s) to "
count=1 count=1
for x in $PDFFILE; do for x in $PDFFILE; do
if [[ $count -eq 1 ]]; then if [[ $count -eq 1 ]]; then
@ -416,19 +430,28 @@ else
fi fi
if [[ $OCR -eq 1 ]]; then if [[ $OCR -eq 1 ]]; then
notify "Running OCR"
OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'" OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'"
$OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1 $OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
if [[ $? -eq 0 ]]; then if [[ $? -eq 0 ]]; then
ok
if [[ -x $P2TPROG ]]; then if [[ -x $P2TPROG ]]; then
words=$($P2TPROG "${PREVIEWFILE}" /dev/stdout | wc -c) allwords=$($P2TPROG "${PREVIEWFILE}" /dev/stdout)
words=$(echo "$allwords" | wc -w | bc)
mostfreq=$(echo "$allwords" | tr ' ' '\n' 2>/dev/null | tr -dc 'A-Za-z\n' 2>/dev/null | awk NF | egrep '[a-z]' | sort | uniq -c | sort -n | tail -1)
mostfreq_word=$(echo "$mostfreq" | awk '{ print $2 }')
mostfreq_count=$(echo "$mostfreq" | awk '{ print $1 }')
[[ $words -eq 1 ]] && ess="" || ess="s" [[ $words -eq 1 ]] && ess="" || ess="s"
inform "OCR complete. Found ^b${word}^p word${ess}" [[ $mostfreq_count -eq 1 ]] && occ_ess="" || occ_ess="s"
inform "Found ^b${words}^p word${ess}, most frequent was ^b${mostfreq_word}^p with ^b${mostfreq_count}^p occurence${occ_ess}."
echo "$allwords" >/tmp/a
else else
inform "OCR complete. Install pdftotext for word counts." warn "pdftotext not installed - skipping word count"
fi fi
else else
error "OCR failed" fail
fi fi
fi fi