Improve word summary when doing OCR.
This commit is contained in:
parent
cb2c50f973
commit
fd8bd1da18
55
scan
55
scan
|
@ -115,7 +115,7 @@ function usage() {
|
|||
|
||||
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
|
||||
OCRPROG="$OCRPROG_DEF"
|
||||
P2TPROG="pdftotext"
|
||||
P2TPROG="/opt/homebrew/bin/pdftotext"
|
||||
DUPLEXOPTS=""
|
||||
MODE="scan"
|
||||
# Directory handed to the scan command via -dir: $PDFDIR when set and
# non-empty, otherwise Documents under the user's home directory.
# $HOME is used rather than "~" because a tilde inside double quotes is not
# expanded by the shell and would be passed on as a literal "~" character.
DIR=${PDFDIR:-"$HOME/Documents"}
|
||||
|
@ -131,7 +131,7 @@ TEST=0
|
|||
FINYEARTAGS=""
|
||||
VERBOSE=0
|
||||
|
||||
ALLARGS="$@"
|
||||
ALLARGS=""
|
||||
if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
|
||||
notify "processing $RCFILE"
|
||||
|
||||
|
@ -149,8 +149,10 @@ if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
|
|||
ok
|
||||
fi
|
||||
|
||||
validargs="cdf:hlmMops:Ttv"
|
||||
while getopts "$validargs" i $ALLARGS; do
|
||||
function processarg() {
|
||||
local i=$1
|
||||
local arg=$2
|
||||
|
||||
case "$i" in
|
||||
v)
|
||||
VERBOSE=1;
|
||||
|
@ -182,16 +184,17 @@ while getopts "$validargs" i $ALLARGS; do
|
|||
;;
|
||||
o)
|
||||
OCR=1;
|
||||
info "OCR mode enabled"
|
||||
;;
|
||||
O)
|
||||
OCRPROG="$OPTARG"
|
||||
OCRPROG="$arg"
|
||||
;;
|
||||
p)
|
||||
PREVIEW=1;
|
||||
info "preview enabled"
|
||||
;;
|
||||
s)
|
||||
SCANNEROPTS="-scanner \"$OPTARG\"";
|
||||
SCANNEROPTS="-scanner \"$arg\"";
|
||||
;;
|
||||
T)
|
||||
info "temp mode"
|
||||
|
@ -201,21 +204,32 @@ while getopts "$validargs" i $ALLARGS; do
|
|||
info "test mode"
|
||||
TEST=1;
|
||||
;;
|
||||
-f)
|
||||
info "tag ^b$OPTARG^p will use financial year paths"
|
||||
FINYEARTAGS="$FINYEARTAGS $OPTARG"
|
||||
f)
|
||||
info "tag ^b$arg^p will use financial year paths"
|
||||
FINYEARTAGS="$FINYEARTAGS $arg"
|
||||
;;
|
||||
*)
|
||||
error "invalid argument: $i";
|
||||
usage;
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
}
|
||||
|
||||
# Option string for getopts. A trailing ':' marks an option that takes a
# value (-f tag, -O OCR program, -s scanner name). 'O:' is listed so that the
# O) arm of processarg is reachable instead of being rejected as invalid.
validargs="cdf:hlmMoO:ps:Ttv"
|
||||
while getopts "$validargs" i $ALLARGS; do
|
||||
processarg "$i" "$OPTARG"
|
||||
done
|
||||
# Restart option parsing from the first argument; assigning 1 is the
# documented way to reset getopts after a previous parsing pass.
OPTIND=1

# Parse the script's own positional parameters. "$@" keeps each argument
# intact, so an option value containing spaces stays a single word.
while getopts "$validargs" i "$@"; do
  processarg "$i" "$OPTARG"
done
|
||||
shift $((OPTIND - 1))
|
||||
|
||||
if [[ $OCR -eq 1 ]]; then
|
||||
if [[ -x $OCRPROG ]]; then
|
||||
info "OCR mode enabled"
|
||||
OCR=1;
|
||||
else
|
||||
wtext="OCR requested but ^b$OCRPROG^p not found."
|
||||
|
@ -329,7 +343,7 @@ while [[ $finished -eq 0 ]]; do
|
|||
if [[ $VERBOSE -eq 1 ]]; then
|
||||
info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
|
||||
fi
|
||||
notify "Scanning..."
|
||||
notify "Scanning"
|
||||
OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
|
||||
rv=$?
|
||||
|
||||
|
@ -398,7 +412,7 @@ fi
|
|||
|
||||
PREVIEWFILE=""
|
||||
if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
|
||||
itext="Scanned $NUMPAGES page(s) to "
|
||||
itext="Scanned ^b$NUMPAGES^p page(s) to "
|
||||
count=1
|
||||
for x in $PDFFILE; do
|
||||
if [[ $count -eq 1 ]]; then
|
||||
|
@ -416,19 +430,28 @@ else
|
|||
fi
|
||||
|
||||
if [[ $OCR -eq 1 ]]; then
|
||||
notify "Running OCR"
|
||||
# Options for the OCR program, kept in an array so every option reaches the
# program as exactly one argument: $TAGS becomes a single --keywords value
# even when it contains spaces, and no literal quote characters are passed.
OCROPTS=(--output-type pdfa -r -d -q --author scan.sh --keywords "$TAGS")

# Run OCR with the same path given twice (input and output); all program
# output is discarded and only the exit status is used by the check below.
"$OCRPROG" "${OCROPTS[@]}" "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
|
||||
if [[ $? -eq 0 ]]; then
|
||||
ok
|
||||
if [[ -x $P2TPROG ]]; then
|
||||
words=$($P2TPROG "${PREVIEWFILE}" /dev/stdout | wc -c)
|
||||
allwords=$($P2TPROG "${PREVIEWFILE}" /dev/stdout)
|
||||
words=$(echo "$allwords" | wc -w | bc)
|
||||
# Reads text on stdin and prints the `uniq -c` line ("<count> <word>") of the
# most frequent word. Words are split on any whitespace (not just spaces),
# stripped of non-letters, and must contain at least one lowercase letter.
# Prints nothing when no such word exists.
_most_frequent_word() {
  tr -s '[:space:]' '\n' 2>/dev/null |
    tr -dc 'A-Za-z\n' 2>/dev/null |
    awk NF |
    grep -E '[a-z]' |
    sort | uniq -c | sort -n | tail -1
}
mostfreq=$(printf '%s\n' "$allwords" | _most_frequent_word)
|
||||
mostfreq_word=$(echo "$mostfreq" | awk '{ print $2 }')
|
||||
mostfreq_count=$(echo "$mostfreq" | awk '{ print $1 }')
|
||||
[[ $words -eq 1 ]] && ess="" || ess="s"
|
||||
inform "OCR complete. Found ^b${word}^p word${ess}"
|
||||
[[ $mostfreq_count -eq 1 ]] && occ_ess="" || occ_ess="s"
|
||||
|
||||
# Summary line: total word count, plus the most frequent word and how often
# it appeared (spelling of "occurrence" corrected in the message).
inform "Found ^b${words}^p word${ess}, most frequent was ^b${mostfreq_word}^p with ^b${mostfreq_count}^p occurrence${occ_ess}."
|
||||
# NOTE(review): writes the full extracted text to the fixed path /tmp/a on
# every OCR run; nothing in the visible code reads it back. Looks like a
# debugging leftover — confirm, then consider removing it or using mktemp.
echo "$allwords" >/tmp/a
|
||||
else
|
||||
inform "OCR complete. Install pdftotext for word counts."
|
||||
warn "pdftotext not installed - skipping word count"
|
||||
fi
|
||||
else
|
||||
error "OCR failed"
|
||||
fail
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
Loading…
Reference in New Issue