Add -o to run final pdf through ocrmypdf

Replace tabs with spaces
Use getopts instead of getopt
Use bashtools.sh
Code tidy up
This commit is contained in:
Rob Pearce 2022-07-10 11:32:06 +10:00
parent d76c74c144
commit 5ffa23ed10
2 changed files with 214 additions and 203 deletions

0
README.md Normal file → Executable file
View File

417
scan
View File

@ -2,16 +2,11 @@
SCANLINE=/usr/local/bin/scanline SCANLINE=/usr/local/bin/scanline
# ANSI stuff . ${HOME}/.bashtools/bashtools.sh
BOLD="\033[1m" if [[ -z $HAVE_BASHTOOLS ]]; then
PLAIN="\033[0m" echo "ERROR: bashtools not installed, download from https://git.nethack.net/rob/bashtools" >/dev/stderr
UNDERLINE="\033[4m" exit 1
RED="\033[31m" fi
GREEN="\033[32m"
BLUE="\033[34m"
CYAN="\033[36m"
LINK="$BLUE$UNDERLINE"
function autotags() { function autotags() {
local file idx local file idx
@ -25,32 +20,12 @@ function autotags() {
else else
TAGS="$TAGS ${ADDTAG[$idx]}" TAGS="$TAGS ${ADDTAG[$idx]}"
fi fi
action "* Inferred tag '${BOLD}${ADDTAG[$idx]}${PLAIN}${CYAN}' from filename." inform "* Inferred tag '^b${ADDTAG[$idx]}^p' from filename."
fi fi
idx=$((idx+1)) idx=$((idx+1))
done done
} }
function cecho() {
local COL
COL="$1"
shift
echo -en "$COL"
echo -e "$*${PLAIN}"
}
function info() {
cecho "$BLUE" "$*"
}
function action() {
cecho "$CYAN" "$*"
}
function error() {
cecho "$RED" "ERROR: $*" >/dev/stderr
}
function mount_local() { # $1=mountpoint function mount_local() { # $1=mountpoint
local mydir local mydir
mydir=$2 mydir=$2
@ -72,9 +47,9 @@ function mount_local() { # $1=mountpoint
# check again... # check again...
is_mounted "$mydir" is_mounted "$mydir"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
error "$mydir could not be mounted." error "$mydir could not be mounted."
sudo rmdir ${mydir} sudo rmdir ${mydir}
return 1 return 1
fi fi
return 0 return 0
} }
@ -94,9 +69,9 @@ function mount_samba() { # $1=share $2=mountpoint
# check again... # check again...
is_mounted "$mydir" is_mounted "$mydir"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
error "$mydir could not be mounted." error "$mydir could not be mounted."
sudo rmdir ${mydir} sudo rmdir ${mydir}
return 1 return 1
fi fi
return 0 return 0
} }
@ -113,32 +88,38 @@ function is_mounted() {
} }
function usage() { function usage() {
echo "usage: $0 [OPTIONS] filename tag1 [tag2] [tag3] ... [tagX]" echo "usage: $0 [OPTIONS] filename tag1 [tag2] [tag3] ... [tagX]"
echo "" echo ""
echo " Scans to: \$PDFDIR/tag1/<year>/filename" echo " Scans to: \$PDFDIR/tag1/<year>/filename"
echo " Creates symlinks in:" echo " Creates symlinks in:"
echo " \$PDFDIR/tag2/<year>/filename" echo " \$PDFDIR/tag2/<year>/filename"
echo " \$PDFDIR/tag3/<year>/filename" echo " \$PDFDIR/tag3/<year>/filename"
echo " ...etc..." echo " ...etc..."
echo "" echo ""
echo " -d scan in duplex mode" echo " -d scan in duplex mode"
echo " -f xxx for given tag, use financial year in path rather than calendar year" echo " -f xxx for given tag, use financial year in path rather than calendar year"
echo " -h show this text" echo " -h show this text"
echo " -l list all available scanners" echo " -l list all available scanners"
echo " -m multi-page mode (prompts to load new pages each time)" echo " -m multi-page mode (prompts to load new pages each time)"
echo " -p preview document after scanning" echo " -o use ocrmypdf to straighten document, extract text into pdf and use tags as keyword metadata"
echo " -s xxx select scanner to use" echo " -O xxx specify path to ocrmypdf"
echo " -T Temporary mode - scan to /tmp/a.pdf" echo " -p preview document after scanning"
echo " -v verbose mode" echo " -s xxx select scanner to use"
echo "" echo " -T Temporary mode - scan to /tmp/a.pdf"
echo " -v verbose mode"
echo ""
} }
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
OCRPROG="$OCRPROG_DEF"
P2TPROG="pdftotext"
DUPLEXOPTS="" DUPLEXOPTS=""
MODE="scan" MODE="scan"
DIR=${PDFDIR:-"~/Documents"} DIR=${PDFDIR:-"~/Documents"}
SHARE=${PDFSHARE:-"//rob@nas.nethack.net:/pdfs"} SHARE=${PDFSHARE:-"//rob@nas.nethack.net:/pdfs"}
SCANNEROPTS="" SCANNEROPTS=""
MULTIPAGE=0 MULTIPAGE=0
OCR=0
PREVIEW=0 PREVIEW=0
RCFILE=${HOME}/.scanrc RCFILE=${HOME}/.scanrc
TAGS="" TAGS=""
@ -147,9 +128,9 @@ TEST=0
FINYEARTAGS="" FINYEARTAGS=""
VERBOSE=0 VERBOSE=0
ALLARGS="$*" ALLARGS="$@"
if [[ -e $RCFILE ]]; then if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
info "[processing $RCFILE]" notify "processing $RCFILE"
while read -r f ; do while read -r f ; do
if [[ $f =~ ^auto\ || $f =~ ^at\ || $f =~ ^autotag\ ]]; then if [[ $f =~ ^auto\ || $f =~ ^at\ || $f =~ ^autotag\ ]]; then
@ -162,155 +143,170 @@ if [[ -e $RCFILE ]]; then
ALLARGS="$f $ALLARGS" ALLARGS="$f $ALLARGS"
fi fi
done < <(egrep -v "(^#|^$)" $RCFILE) done < <(egrep -v "(^#|^$)" $RCFILE)
ok
fi fi
validargs="cdf:hlmMops:Ttv"
ARGS=$(getopt cdf:hlmMps:Ttv $ALLARGS) while getopts "$validargs" i $ALLARGS; do
eval set -- $ARGS case "$i" in
for i do v)
case "$i" in VERBOSE=1;
-v) info "verbose mode"
VERBOSE=1; shift 1; ;;
info "[verbose mode]" d)
;; DUPLEXOPTS="-duplex";
-d) info "duplex mode"
DUPLEXOPTS="-duplex"; shift 1; ;;
info "[duplex mode]" h)
;; usage; exit 1;
-h) ;;
usage; exit 1; l)
;; ${SCANLINE} -list; exit 1;
-l) ;;
${SCANLINE} -list; exit 1; m)
;; MULTIPAGE=1;
-m) info "multi-page mode enabled"
MULTIPAGE=1; shift 1; ;;
info "[multi-page mode enabled]" M)
;; inform "${CYAN}Manually mounting ^b${DIR}^p, enter password if prompted"
-M) mount_samba "${SHARE}" "${DIR}"
shift 1; if [ $? -ne 0 ]; then
echo -ne "${CYAN}Manually mounting ${DIR}...$PLAIN" error "Mount failed"
mount_samba "${SHARE}" "${DIR}" exit 1
if [ $? -ne 0 ]; then fi
cecho $RED "failed" inform "Mount complete"
exit 1 exit 0
fi ;;
cecho $GREEN "done" o)
exit 0 OCR=1;
;; ;;
-p) O)
PREVIEW=1; shift 1; OCRPROG="$OPTARG"
info "[preview enabled]" ;;
;; p)
-s) PREVIEW=1;
SCANNEROPTS="-scanner \"$2\""; shift 2; info "preview enabled"
;; ;;
-T) s)
info "[temp mode]" SCANNEROPTS="-scanner \"$OPTARG\"";
TEMP=1; shift 1; ;;
;; T)
-t) info "temp mode"
info "[test mode]" TEMP=1;
TEST=1; shift 1; ;;
;; t)
-f) info "test mode"
info "[tag ${BOLD}$2${PLAIN}${BLUE} will use financial year paths]" TEST=1;
FINYEARTAGS="$FINYEARTAGS $2" ;;
shift 2 -f)
;; info "tag ^b$OPTARG^p will use financial year paths"
--) FINYEARTAGS="$FINYEARTAGS $OPTARG"
shift ;;
;; *)
esac error "invalid argument: $i";
usage;
;;
esac
done done
shift $((OPTIND - 1))
if [[ $OCR -eq 1 ]]; then
if [[ -x $OCRPROG ]]; then
info "OCR mode enabled"
OCR=1;
else
wtext="OCR requested but ^b$OCRPROG^p not found."
if [[ $OCRPROG == $OCRPROG_DEF ]]; then
wtext="${wtext} Use ^b-O^p to specify alternate binary."
fi
warn "$wtext"
fi
fi
if [[ $TEMP -eq 1 ]]; then if [[ $TEMP -eq 1 ]]; then
DIR=/tmp DIR=/tmp
FILENAME=a FILENAME=a
else else
if [ $# -lt 1 ]; then if [ $# -lt 1 ]; then
usage usage
exit 1 exit 1
fi fi
# first arg is filename, rest are tags # first arg is filename, rest are tags
FILENAME=$1 FILENAME=$1
shift 1 shift 1
autotags "$FILENAME" # determine tags from filename
autotags "$FILENAME" # determine tags from filename while [[ $# -ge 1 ]]; do
while [[ $# -ge 1 ]]; do inform "Got tag '^b$1^p' on command line."
action "* Got tag '${BOLD}$1${PLAIN}${CYAN}' on command line."
if [[ -z $TAGS ]]; then
TAGS="$1"
else
TAGS="$TAGS $1"
fi
shift 1
done
if [[ -z $TAGS ]]; then if [[ -z $TAGS ]]; then
error "No tags specified or inferred from filename." TAGS="$1"
exit 1 else
fi TAGS="$TAGS $1"
# This will be the directory which scanline writes the pdf to
FIRSTTAG=`echo $TAGS | awk '{ print $1 }'`
# Remove duplicate tags
TAGS=`echo $TAGS | tr ' ' '\n' | sort -u | tr '\n' ' ' | sed -e 's/ $//'`
if [[ $TEST -eq 1 ]]; then
echo -e "Tags found: ${GREEN}${TAGS}${PLAIN}"
echo -e "PDF will be written to ${BOLD}${DIR}/${GREEN}${FIRSTTAG}${PLAIN}${BOLD}/${FILENAME}.pdf${PLAIN}."
echo -e "Symlinks will be created in:"
for t in $TAGS; do
if [[ $t != $FIRSTTAG ]]; then
echo -e " - ${DIR}/${GREEN}${t}${PLAIN}/${FILENAME}.pdf${PLAIN}"
fi
done
exit 0
fi fi
shift 1
done
if [[ -z $TAGS ]]; then
error "No tags specified or inferred from filename."
exit 1
fi
# This will be the directory which scanline writes the pdf to
FIRSTTAG=`echo $TAGS | awk '{ print $1 }'`
# Remove duplicate tags
TAGS=`echo $TAGS | tr ' ' '\n' | sort -u | tr '\n' ' ' | sed -e 's/ $//'`
if [[ $TEST -eq 1 ]]; then
inform "Tags found: ${GREEN}${BOLD}${TAGS}^p"
inform "PDF will be written to ^b${DIR}/${GREEN}${BOLD}${FIRSTTAG}^p^b/${FILENAME}.pdf."
inform "Symlinks will be created in:"
for t in $TAGS; do
if [[ $t != $FIRSTTAG ]]; then
inform " - ${DIR}/${GREEN}${t}^p${FILENAME}.pdf"
fi
done
exit 0
fi
fi fi
if [[ $TEMP -eq 0 ]]; then if [[ $TEMP -eq 0 ]]; then
# Check that target pdfs share is mounted # Check that target pdfs share is mounted
if [[ $DIR =~ mnt|pdf ]]; then if [[ $DIR =~ mnt|pdf ]]; then
df -h ${DIR} 2>&1 | grep @ >/dev/null 2>&1 df -h ${DIR} 2>&1 | grep @ >/dev/null 2>&1
is_mounted "$DIR" is_mounted "$DIR"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
info "$DIR not mounted - trying to mount it..." info "$DIR not mounted - trying to mount it..."
#mount_samba "${SHARE}" "${DIR}" #mount_samba "${SHARE}" "${DIR}"
mount_local "${DIR}" mount_local "${DIR}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
fi
fi
# Check that we didn't mix up the filename and tags
if ! [ -d ${DIR}/${FIRSTTAG} ] ; then
error "$DIR/$FIRSTTAG doesn't exist, did you mix up filename and tags?"
exit 1
fi
if [[ $FILENAME == */* ]]; then
error "Filename ${BOLD}$FILENAME${PLAIN}${RED} contains a slash - did you mix up filename and tags?"
exit 1
fi fi
fi
if [[ $FILENAME == *,* ]]; then # Check that we didn't mix up the filename and tags
error "Filename ${BOLD}$FILENAME${PLAIN}${RED} is illegal - commas not allowed." if ! [ -d ${DIR}/${FIRSTTAG} ] ; then
exit 1 error "$DIR/$FIRSTTAG doesn't exist, did you mix up filename and tags?"
fi exit 1
fi
if [[ $FILENAME == */* ]]; then
error "Filename ^b$FILENAME^p contains a slash - did you mix up filename and tags?"
exit 1
fi
if [[ $FILENAME == *,* ]]; then
error "Filename ^b$FILENAME^p is illegal - commas not allowed."
exit 1
fi
fi fi
# do the scan - capture output # do the scan - capture output
#${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $* #${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $*
#exec 5>&1 #exec 5>&1
#OUTPUT=$( ${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $* 2>&1 |tee /dev/fd/5; exit ${PIPESTATUS[0]}) #OUTPUT=$( ${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $* 2>&1 |tee /dev/fd/5; exit ${PIPESTATUS[0]})
#rv=$? #rv=$?
finished=0 finished=0
NUMPAGES=0 NUMPAGES=0
TEMPFILE=`mktemp /tmp/scan.XXXXXX` TEMPFILE=`mktemp /tmp/scan.XXXXXX`
@ -327,22 +323,21 @@ while [[ $finished -eq 0 ]]; do
fi fi
# scan new file # scan new file
if [[ $VERBOSE -eq 1 ]]; then if [[ $VERBOSE -eq 1 ]]; then
info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1" info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
fi fi
echo -e -n "${CYAN}Scanning..." notify "Scanning..."
OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1) OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
rv=$? rv=$?
if [ $rv -eq 0 ]; then if [ $rv -eq 0 ]; then
cecho $GREEN "done" ok "done"
else else
cecho $RED "failed" fail
echo "" echo ""
echo -e "${BOLD}${UNDERLINE}Full output:${PLAIN}" csecho "$RED" "^bFull output:^p"
echo "$OUTPUT" | sed -e 's/^/ /' csecho -e "$RED" "$OUTPUT" | sed -e 's/^/ /'
exit 1 exit 1
fi fi
thisnpages=`printf %d $( echo "$OUTPUT" | egrep '^(Scan complete|didScanTo)' | wc -l )` thisnpages=`printf %d $( echo "$OUTPUT" | egrep '^(Scan complete|didScanTo)' | wc -l )`
NUMPAGES=$(( $NUMPAGES + $thisnpages )) NUMPAGES=$(( $NUMPAGES + $thisnpages ))
@ -358,7 +353,7 @@ while [[ $finished -eq 0 ]]; do
else else
# multi-line mode # multi-line mode
cecho $GREEN "Scanned + $thisnpages page(s) -> $NUMPAGES total." csecho "$GREEN" "Scanned +^b$thisnpages^p page(s) -> ^b$NUMPAGES^p total."
if [[ $gotexisting -eq 1 ]]; then if [[ $gotexisting -eq 1 ]]; then
# append newly-scanned page file on to rest of the pdf. # append newly-scanned page file on to rest of the pdf.
pdftk ${TEMPFILE} ${PDFFILE} cat output ${TEMPFILE2} pdftk ${TEMPFILE} ${PDFFILE} cat output ${TEMPFILE2}
@ -367,8 +362,7 @@ while [[ $finished -eq 0 ]]; do
rm -rf ${TEMPFILE} rm -rf ${TEMPFILE}
fi fi
echo -en "Insert next pages and press ${BOLD}ENTER${PLAIN}, or type '${BOLD}n${PLAIN}':${PLAIN}" ask "Insert next pages and press ^bENTER^p, or type '^bn^p':" "y" yn
read -p " " yn
if [[ $yn == "n" ]]; then if [[ $yn == "n" ]]; then
finished=1 finished=1
fi fi
@ -391,7 +385,7 @@ if [[ $curmonth -ge 7 ]]; then
for t in $FINYEARTAGS; do for t in $FINYEARTAGS; do
if [[ $x == */${t}/* ]]; then if [[ $x == */${t}/* ]]; then
newname=`echo "$x" | sed -e "s,/${t}/$curyear/,/${t}/$nextyear/,"` newname=`echo "$x" | sed -e "s,/${t}/$curyear/,/${t}/$nextyear/,"`
action "* Adjusting path for financial year: $x -> ${BOLD}$newname${PLAIN}" inform "Adjusting path for financial year: $x -> ^b$newname^p"
mv -f "$x" "$newname" mv -f "$x" "$newname"
PDFFILE=$(echo "$PDFFILE" | sed -e "s,${x},${newname},") PDFFILE=$(echo "$PDFFILE" | sed -e "s,${x},${newname},")
fi fi
@ -401,29 +395,46 @@ fi
PREVIEWFILE="" PREVIEWFILE=""
if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
echo -e -n "${GREEN}Scanned $NUMPAGES page(s) to " itext="Scanned $NUMPAGES page(s) to "
count=1 count=1
for x in $PDFFILE; do for x in $PDFFILE; do
if [[ $count -eq 1 ]]; then if [[ $count -eq 1 ]]; then
PREVIEWFILE="$x" PREVIEWFILE="$x"
echo -e -n "${BOLD}${GREEN}${x}${PLAIN}" itext="${itext}^b${x}^p"
else else
echo -e -n "${GREEN} + ${BOLD}${x}${PLAIN}" itext="${itext} + ^b${x}^p"
fi fi
count=$((count + 1)) count=$((count + 1))
done done
echo -e "${PLAIN}" inform "$itext"
else else
PREVIEWFILE="$PDFFILE" PREVIEWFILE="$PDFFILE"
cecho $GREEN "Scanned $NUMPAGES page(s) to ${BOLD}${PDFFILE}${PLAIN}" inform "Scanned $NUMPAGES page(s) to ^b${PDFFILE}^p"
fi
if [[ $OCR -eq 1 ]]; then
OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'"
$OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
if [[ $? -eq 0 ]]; then
if [[ -x $P2TPROG ]]; then
words=$($P2TPROG "${PREVIEWFILE}" /dev/stdout | wc -c)
[[ $words -eq 1 ]] && ess="" || ess="s"
inform "OCR complete. Found ^b${word}^p word${ess}"
else
inform "OCR complete. Install pdftotext for word counts."
fi
else
error "OCR failed"
fi
fi fi
# Put the full path onto the copy buffer # Put the full path onto the copy buffer
echo -n "${PREVIEWFILE}" | pbcopy echo -n "${PREVIEWFILE}" | pbcopy
if [ $PREVIEW -eq 1 ]; then if [ $PREVIEW -eq 1 ]; then
action "Showing preview..." inform "Showing preview..."
open "${PREVIEWFILE}" open "${PREVIEWFILE}"
fi fi