Improve word summary when doing OCR.

2022-07-23 12:05:21 +10:00 · 2022-07-23 12:05:21 +10:00 · fd8bd1da18
parent cb2c50f973
commit fd8bd1da18
1 changed file with 39 additions and 16 deletions
--- a/55
+++ b/55
@@ -115,7 +115,7 @@ function usage() {
 OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
 OCRPROG="$OCRPROG_DEF"
-P2TPROG="pdftotext"
+P2TPROG="/opt/homebrew/bin/pdftotext"
 DUPLEXOPTS=""
 MODE="scan"
 DIR=${PDFDIR:-"~/Documents"}
@@ -131,7 +131,7 @@ TEST=0
 FINYEARTAGS=""
 VERBOSE=0
-ALLARGS="$@"
+ALLARGS=""
 if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
    notify "processing $RCFILE"
@@ -149,8 +149,10 @@ if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
    ok
 fi
-validargs="cdf:hlmMops:Ttv"
+function processarg() {
-while getopts "$validargs" i $ALLARGS; do
+    local i=$1
    local arg=$2
    case "$i" in
        v)
            VERBOSE=1;
@@ -182,16 +184,17 @@ while getopts "$validargs" i $ALLARGS; do
            ;;
        o)
            OCR=1;
            info "OCR mode enabled"
            ;;
        O)
-            OCRPROG="$OPTARG"
+            OCRPROG="$arg"
            ;;
        p)
            PREVIEW=1;
            info "preview enabled"
            ;;
        s)
-            SCANNEROPTS="-scanner \"$OPTARG\"";
+            SCANNEROPTS="-scanner \"$arg\"";
            ;;
        T)
            info "temp mode"
@@ -201,21 +204,32 @@ while getopts "$validargs" i $ALLARGS; do
            info "test mode"
            TEST=1;
            ;;
-        -f)
+        f)
-            info "tag ^b$OPTARG^p will use financial year paths"
+            info "tag ^b$arg^p will use financial year paths"
-            FINYEARTAGS="$FINYEARTAGS $OPTARG"
+            FINYEARTAGS="$FINYEARTAGS $arg"
            ;;
        *)
            error "invalid argument:  $i";
            usage;
            exit 1
            ;;
    esac
 }
 validargs="cdf:hlmMops:Ttv"
 while getopts "$validargs" i $ALLARGS; do
    processarg "$i" "$OPTARG"
 done
 unset OPTIND
 while getopts "$validargs" i $*; do
    processarg "$i" "$OPTARG"
 done
 shift $((OPTIND - 1))
 if [[ $OCR -eq 1 ]]; then
    if [[ -x $OCRPROG ]]; then
        info "OCR mode enabled"
        OCR=1;
    else
        wtext="OCR requested but ^b$OCRPROG^p not found."
@@ -329,7 +343,7 @@ while [[ $finished -eq 0 ]]; do
    if [[ $VERBOSE -eq 1 ]]; then
        info "will run:  ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
    fi
-    notify "Scanning..."
+    notify "Scanning"
    OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
    rv=$?
@@ -398,7 +412,7 @@ fi
 PREVIEWFILE=""
 if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
-    itext="Scanned $NUMPAGES page(s) to "
+    itext="Scanned ^b$NUMPAGES^p page(s) to "
    count=1
    for x in $PDFFILE; do
        if [[ $count -eq 1 ]]; then
@@ -416,19 +430,28 @@ else
 fi
 if [[ $OCR -eq 1 ]]; then
    notify "Running OCR"
    OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'"
    $OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
    if [[ $? -eq 0 ]]; then
        ok
        if [[ -x $P2TPROG ]]; then
-            words=$($P2TPROG "${PREVIEWFILE}" /dev/stdout | wc -c)
+            allwords=$($P2TPROG "${PREVIEWFILE}" /dev/stdout)
            words=$(echo "$allwords" | wc -w | bc)
            mostfreq=$(echo "$allwords" | tr ' ' '\n' 2>/dev/null | tr -dc 'A-Za-z\n' 2>/dev/null | awk NF | egrep '[a-z]' | sort | uniq -c | sort -n | tail -1)
            mostfreq_word=$(echo "$mostfreq" | awk '{ print $2 }')
            mostfreq_count=$(echo "$mostfreq" | awk '{ print $1 }')
            [[ $words -eq 1 ]] && ess="" || ess="s"
-            inform "OCR complete.  Found ^b${word}^p word${ess}"
+            [[ $mostfreq_count -eq 1 ]] && occ_ess="" || occ_ess="s"
            inform "Found ^b${words}^p word${ess}, most frequent was ^b${mostfreq_word}^p with ^b${mostfreq_count}^p occurence${occ_ess}."
 echo "$allwords" >/tmp/a
        else
-            inform "OCR complete.  Install pdftotext for word counts."
+            warn "pdftotext not installed - skipping word count"
        fi
    else
-        error "OCR failed"
+        fail
    fi
 fi