From fd8bd1da18b11794a810f881441b69e99a7ed3e2 Mon Sep 17 00:00:00 2001
From: Rob Pearce
Date: Sat, 23 Jul 2022 12:05:21 +1000
Subject: [PATCH] Improve word summary when doing OCR.

After a successful OCR pass, report how many words pdftotext found
together with the most frequent word and its occurrence count. The old
code piped pdftotext through "wc -c" and printed ${word}, although the
variable it assigned was ${words}.

Supporting changes:
- Option handling moves into processarg(), so the $ALLARGS pass and the
  command-line pass share one case statement.
- pdftotext is referenced by absolute path; its presence is tested with
  [[ -x $P2TPROG ]].
- OCR progress is shown as a "Running OCR" notification followed by
  ok/fail, and "OCR mode enabled" is reported where -o is parsed.
---
Review notes (text here is ignored by git am):
- validargs gains "O:" so that the O) arm, which reads an option
  argument, can be reached.
- The $ALLARGS pass is skipped when $ALLARGS is blank: getopts called
  without explicit arguments parses the positional parameters, which
  would process the command line twice.
- OPTIND is reset with OPTIND=1 instead of unset, and the command line
  is parsed from "$@" so option arguments containing spaces stay whole.
- NOTE(review): ALLARGS is empty when "$ALLARGS != *-h*" is evaluated in
  the rc-file test, so that test now always passes - confirm whether -h
  should still skip rc processing.
- NOTE(review): leading whitespace and blank context lines in this copy
  are reconstructed, and the blob ids on the index line describe the
  original post-image. Regenerate from the working tree before applying.

 scan | 55 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 16 deletions(-)

diff --git a/scan b/scan
index 40d9f68..9396281 100755
--- a/scan
+++ b/scan
@@ -115,7 +115,7 @@ function usage() {
 
 OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
 OCRPROG="$OCRPROG_DEF"
-P2TPROG="pdftotext"
+P2TPROG="/opt/homebrew/bin/pdftotext"
 DUPLEXOPTS=""
 MODE="scan"
 DIR=${PDFDIR:-"~/Documents"}
@@ -131,7 +131,7 @@ TEST=0
 FINYEARTAGS=""
 VERBOSE=0
 
-ALLARGS="$@"
+ALLARGS=""
 
 if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
     notify "processing $RCFILE"
@@ -149,8 +149,10 @@ if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
     ok
 fi
 
-validargs="cdf:hlmMops:Ttv"
-while getopts "$validargs" i $ALLARGS; do
+# Apply one parsed option: $1 is the option letter, $2 its argument (if any).
+function processarg() {
+    local i=$1
+    local arg=$2
     case "$i" in
         v)
             VERBOSE=1;
@@ -182,16 +184,17 @@ while getopts "$validargs" i $ALLARGS; do
             ;;
         o)
             OCR=1;
+            info "OCR mode enabled"
             ;;
         O)
-            OCRPROG="$OPTARG"
+            OCRPROG="$arg"
             ;;
         p)
             PREVIEW=1;
             info "preview enabled"
             ;;
         s)
-            SCANNEROPTS="-scanner \"$OPTARG\"";
+            SCANNEROPTS="-scanner \"$arg\"";
             ;;
         T)
             info "temp mode"
@@ -201,21 +204,32 @@ while getopts "$validargs" i $ALLARGS; do
             info "test mode"
             TEST=1;
             ;;
-        -f)
-            info "tag ^b$OPTARG^p will use financial year paths"
-            FINYEARTAGS="$FINYEARTAGS $OPTARG"
+        f)
+            info "tag ^b$arg^p will use financial year paths"
+            FINYEARTAGS="$FINYEARTAGS $arg"
             ;;
         *)
             error "invalid argument: $i";
             usage;
+            exit 1
             ;;
     esac
+}
+
+validargs="cdf:hlmMoO:ps:Ttv"
+# $ALLARGS pass; skipped when it is blank, since getopts would then read "$@".
+while [[ $ALLARGS == *[![:space:]]* ]] && getopts "$validargs" i $ALLARGS; do
+    processarg "$i" "$OPTARG"
+done
+OPTIND=1
+
+while getopts "$validargs" i "$@"; do
+    processarg "$i" "$OPTARG"
 done
 shift $((OPTIND - 1))
 
 if [[ $OCR -eq 1 ]]; then
     if [[ -x $OCRPROG ]]; then
-        info "OCR mode enabled"
         OCR=1;
     else
         wtext="OCR requested but ^b$OCRPROG^p not found."
@@ -329,7 +343,7 @@ while [[ $finished -eq 0 ]]; do
     if [[ $VERBOSE -eq 1 ]]; then
         info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
     fi
-    notify "Scanning..."
+    notify "Scanning"
     OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
     rv=$?
 
@@ -398,7 +412,7 @@ fi
 
 PREVIEWFILE=""
 if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
-    itext="Scanned $NUMPAGES page(s) to "
+    itext="Scanned ^b$NUMPAGES^p page(s) to "
     count=1
     for x in $PDFFILE; do
         if [[ $count -eq 1 ]]; then
@@ -416,19 +430,28 @@ else
 fi
 
 if [[ $OCR -eq 1 ]]; then
+    notify "Running OCR"
     OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'"
 
     $OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
     if [[ $? -eq 0 ]]; then
+        ok
         if [[ -x $P2TPROG ]]; then
-            words=$($P2TPROG "${PREVIEWFILE}" /dev/stdout | wc -c)
+            allwords=$($P2TPROG "${PREVIEWFILE}" /dev/stdout)
+            words=$(echo "$allwords" | wc -w | bc)
+            # mostfreq is "<count> <word>": the commonest token once non-letters are stripped
+            # (tokens with no lowercase letter are ignored); "0 (none)" when nothing qualifies.
+            mostfreq=$(echo "$allwords" | tr ' ' '\n' 2>/dev/null | tr -dc 'A-Za-z\n' 2>/dev/null | awk NF | grep -E '[a-z]' | sort | uniq -c | sort -n | tail -1)
+            read -r mostfreq_count mostfreq_word <<<"${mostfreq:-0 (none)}"
             [[ $words -eq 1 ]] && ess="" || ess="s"
-            inform "OCR complete. Found ^b${word}^p word${ess}"
+            [[ $mostfreq_count -eq 1 ]] && occ_ess="" || occ_ess="s"
+
+            inform "Found ^b${words}^p word${ess}, most frequent was ^b${mostfreq_word}^p with ^b${mostfreq_count}^p occurrence${occ_ess}."
         else
-            inform "OCR complete. Install pdftotext for word counts."
+            warn "pdftotext not installed - skipping word count"
         fi
     else
-        error "OCR failed"
+        fail
     fi
 fi
 