Add -o to run final pdf through ocrmypdf
Replace tabs with spaces. Use getopts instead of getopt. Use bashtools.sh. Code tidy up.
This commit is contained in:
parent
d76c74c144
commit
5ffa23ed10
203
scan
203
scan
|
@ -2,16 +2,11 @@
|
||||||
|
|
||||||
SCANLINE=/usr/local/bin/scanline
|
SCANLINE=/usr/local/bin/scanline
|
||||||
|
|
||||||
# ANSI stuff
|
. ${HOME}/.bashtools/bashtools.sh
|
||||||
BOLD="\033[1m"
|
if [[ -z $HAVE_BASHTOOLS ]]; then
|
||||||
PLAIN="\033[0m"
|
echo "ERROR: bashtools not installed, download from https://git.nethack.net/rob/bashtools" >/dev/stderr
|
||||||
UNDERLINE="\033[4m"
|
exit 1
|
||||||
RED="\033[31m"
|
fi
|
||||||
GREEN="\033[32m"
|
|
||||||
BLUE="\033[34m"
|
|
||||||
CYAN="\033[36m"
|
|
||||||
LINK="$BLUE$UNDERLINE"
|
|
||||||
|
|
||||||
|
|
||||||
function autotags() {
|
function autotags() {
|
||||||
local file idx
|
local file idx
|
||||||
|
@ -25,32 +20,12 @@ function autotags() {
|
||||||
else
|
else
|
||||||
TAGS="$TAGS ${ADDTAG[$idx]}"
|
TAGS="$TAGS ${ADDTAG[$idx]}"
|
||||||
fi
|
fi
|
||||||
action "* Inferred tag '${BOLD}${ADDTAG[$idx]}${PLAIN}${CYAN}' from filename."
|
inform "* Inferred tag '^b${ADDTAG[$idx]}^p' from filename."
|
||||||
fi
|
fi
|
||||||
idx=$((idx+1))
|
idx=$((idx+1))
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
function cecho() {
|
|
||||||
local COL
|
|
||||||
COL="$1"
|
|
||||||
shift
|
|
||||||
echo -en "$COL"
|
|
||||||
echo -e "$*${PLAIN}"
|
|
||||||
}
|
|
||||||
|
|
||||||
function info() {
|
|
||||||
cecho "$BLUE" "$*"
|
|
||||||
}
|
|
||||||
|
|
||||||
function action() {
|
|
||||||
cecho "$CYAN" "$*"
|
|
||||||
}
|
|
||||||
|
|
||||||
function error() {
|
|
||||||
cecho "$RED" "ERROR: $*" >/dev/stderr
|
|
||||||
}
|
|
||||||
|
|
||||||
function mount_local() { # $1=mountpoint
|
function mount_local() { # $1=mountpoint
|
||||||
local mydir
|
local mydir
|
||||||
mydir=$2
|
mydir=$2
|
||||||
|
@ -126,6 +101,8 @@ function usage() {
|
||||||
echo " -h show this text"
|
echo " -h show this text"
|
||||||
echo " -l list all available scanners"
|
echo " -l list all available scanners"
|
||||||
echo " -m multi-page mode (prompts to load new pages each time)"
|
echo " -m multi-page mode (prompts to load new pages each time)"
|
||||||
|
echo " -o use ocrmypdf to straighten document, extract text into pdf and use tags as keyword metadata"
|
||||||
|
echo " -O xxx specify path to ocrmypdf"
|
||||||
echo " -p preview document after scanning"
|
echo " -p preview document after scanning"
|
||||||
echo " -s xxx select scanner to use"
|
echo " -s xxx select scanner to use"
|
||||||
echo " -T Temporary mode - scan to /tmp/a.pdf"
|
echo " -T Temporary mode - scan to /tmp/a.pdf"
|
||||||
|
@ -133,12 +110,16 @@ function usage() {
|
||||||
echo ""
|
echo ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
|
||||||
|
OCRPROG="$OCRPROG_DEF"
|
||||||
|
P2TPROG="pdftotext"
|
||||||
DUPLEXOPTS=""
|
DUPLEXOPTS=""
|
||||||
MODE="scan"
|
MODE="scan"
|
||||||
DIR=${PDFDIR:-"~/Documents"}
|
DIR=${PDFDIR:-"~/Documents"}
|
||||||
SHARE=${PDFSHARE:-"//rob@nas.nethack.net:/pdfs"}
|
SHARE=${PDFSHARE:-"//rob@nas.nethack.net:/pdfs"}
|
||||||
SCANNEROPTS=""
|
SCANNEROPTS=""
|
||||||
MULTIPAGE=0
|
MULTIPAGE=0
|
||||||
|
OCR=0
|
||||||
PREVIEW=0
|
PREVIEW=0
|
||||||
RCFILE=${HOME}/.scanrc
|
RCFILE=${HOME}/.scanrc
|
||||||
TAGS=""
|
TAGS=""
|
||||||
|
@ -147,9 +128,9 @@ TEST=0
|
||||||
FINYEARTAGS=""
|
FINYEARTAGS=""
|
||||||
VERBOSE=0
|
VERBOSE=0
|
||||||
|
|
||||||
ALLARGS="$*"
|
ALLARGS="$@"
|
||||||
if [[ -e $RCFILE ]]; then
|
if [[ -e $RCFILE && $ALLARGS != *-h* ]]; then
|
||||||
info "[processing $RCFILE]"
|
notify "processing $RCFILE"
|
||||||
|
|
||||||
while read -r f ; do
|
while read -r f ; do
|
||||||
if [[ $f =~ ^auto\ || $f =~ ^at\ || $f =~ ^autotag\ ]]; then
|
if [[ $f =~ ^auto\ || $f =~ ^at\ || $f =~ ^autotag\ ]]; then
|
||||||
|
@ -162,67 +143,85 @@ if [[ -e $RCFILE ]]; then
|
||||||
ALLARGS="$f $ALLARGS"
|
ALLARGS="$f $ALLARGS"
|
||||||
fi
|
fi
|
||||||
done < <(egrep -v "(^#|^$)" $RCFILE)
|
done < <(egrep -v "(^#|^$)" $RCFILE)
|
||||||
|
ok
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# getopts option string. A trailing ':' marks an option that takes an argument.
# -O (path to ocrmypdf) takes an argument, so it must be listed as "O:";
# without it getopts rejects -O and the O) arm never sees $OPTARG.
validargs="cdf:hlmMoO:ps:Ttv"
|
||||||
ARGS=$(getopt cdf:hlmMps:Ttv $ALLARGS)
|
while getopts "$validargs" i $ALLARGS; do
|
||||||
eval set -- $ARGS
|
|
||||||
for i do
|
|
||||||
case "$i" in
|
case "$i" in
|
||||||
-v)
|
v)
|
||||||
VERBOSE=1; shift 1;
|
VERBOSE=1;
|
||||||
info "[verbose mode]"
|
info "verbose mode"
|
||||||
;;
|
;;
|
||||||
-d)
|
d)
|
||||||
DUPLEXOPTS="-duplex"; shift 1;
|
DUPLEXOPTS="-duplex";
|
||||||
info "[duplex mode]"
|
info "duplex mode"
|
||||||
;;
|
;;
|
||||||
-h)
|
h)
|
||||||
usage; exit 1;
|
usage; exit 1;
|
||||||
;;
|
;;
|
||||||
-l)
|
l)
|
||||||
${SCANLINE} -list; exit 1;
|
${SCANLINE} -list; exit 1;
|
||||||
;;
|
;;
|
||||||
-m)
|
m)
|
||||||
MULTIPAGE=1; shift 1;
|
MULTIPAGE=1;
|
||||||
info "[multi-page mode enabled]"
|
info "multi-page mode enabled"
|
||||||
;;
|
;;
|
||||||
-M)
|
M)
|
||||||
shift 1;
|
inform "${CYAN}Manually mounting ^b${DIR}^p, enter password if prompted"
|
||||||
echo -ne "${CYAN}Manually mounting ${DIR}...$PLAIN"
|
|
||||||
mount_samba "${SHARE}" "${DIR}"
|
mount_samba "${SHARE}" "${DIR}"
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
cecho $RED "failed"
|
error "Mount failed"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
cecho $GREEN "done"
|
inform "Mount complete"
|
||||||
exit 0
|
exit 0
|
||||||
;;
|
;;
|
||||||
-p)
|
o)
|
||||||
PREVIEW=1; shift 1;
|
OCR=1;
|
||||||
info "[preview enabled]"
|
|
||||||
;;
|
;;
|
||||||
-s)
|
O)
|
||||||
SCANNEROPTS="-scanner \"$2\""; shift 2;
|
OCRPROG="$OPTARG"
|
||||||
;;
|
;;
|
||||||
-T)
|
p)
|
||||||
info "[temp mode]"
|
PREVIEW=1;
|
||||||
TEMP=1; shift 1;
|
info "preview enabled"
|
||||||
;;
|
;;
|
||||||
-t)
|
s)
|
||||||
info "[test mode]"
|
SCANNEROPTS="-scanner \"$OPTARG\"";
|
||||||
TEST=1; shift 1;
|
;;
|
||||||
|
T)
|
||||||
|
info "temp mode"
|
||||||
|
TEMP=1;
|
||||||
|
;;
|
||||||
|
t)
|
||||||
|
info "test mode"
|
||||||
|
TEST=1;
|
||||||
;;
|
;;
|
||||||
-f)
|
# getopts stores the bare option letter in $i (no leading dash), so this arm
# must match "f"; a "-f" pattern would never fire and -f would fall through to *).
f)
|
||||||
info "[tag ${BOLD}$2${PLAIN}${BLUE} will use financial year paths]"
|
info "tag ^b$OPTARG^p will use financial year paths"
|
||||||
FINYEARTAGS="$FINYEARTAGS $2"
|
FINYEARTAGS="$FINYEARTAGS $OPTARG"
|
||||||
shift 2
|
|
||||||
;;
|
;;
|
||||||
--)
|
*)
|
||||||
shift
|
error "invalid argument: $i";
|
||||||
|
usage;
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
shift $((OPTIND - 1))
|
||||||
|
|
||||||
|
if [[ $OCR -eq 1 ]]; then
|
||||||
|
if [[ -x $OCRPROG ]]; then
|
||||||
|
info "OCR mode enabled"
|
||||||
|
OCR=1;
|
||||||
|
else
|
||||||
|
wtext="OCR requested but ^b$OCRPROG^p not found."
|
||||||
|
if [[ $OCRPROG == $OCRPROG_DEF ]]; then
|
||||||
|
wtext="${wtext} Use ^b-O^p to specify alternate binary."
|
||||||
|
fi
|
||||||
|
warn "$wtext"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $TEMP -eq 1 ]]; then
|
if [[ $TEMP -eq 1 ]]; then
|
||||||
DIR=/tmp
|
DIR=/tmp
|
||||||
|
@ -237,10 +236,9 @@ else
|
||||||
FILENAME=$1
|
FILENAME=$1
|
||||||
shift 1
|
shift 1
|
||||||
|
|
||||||
|
|
||||||
autotags "$FILENAME" # determine tags from filename
|
autotags "$FILENAME" # determine tags from filename
|
||||||
while [[ $# -ge 1 ]]; do
|
while [[ $# -ge 1 ]]; do
|
||||||
action "* Got tag '${BOLD}$1${PLAIN}${CYAN}' on command line."
|
inform "Got tag '^b$1^p' on command line."
|
||||||
if [[ -z $TAGS ]]; then
|
if [[ -z $TAGS ]]; then
|
||||||
TAGS="$1"
|
TAGS="$1"
|
||||||
else
|
else
|
||||||
|
@ -260,12 +258,12 @@ else
|
||||||
TAGS=`echo $TAGS | tr ' ' '\n' | sort -u | tr '\n' ' ' | sed -e 's/ $//'`
|
TAGS=`echo $TAGS | tr ' ' '\n' | sort -u | tr '\n' ' ' | sed -e 's/ $//'`
|
||||||
|
|
||||||
if [[ $TEST -eq 1 ]]; then
|
if [[ $TEST -eq 1 ]]; then
|
||||||
echo -e "Tags found: ${GREEN}${TAGS}${PLAIN}"
|
inform "Tags found: ${GREEN}${BOLD}${TAGS}^p"
|
||||||
echo -e "PDF will be written to ${BOLD}${DIR}/${GREEN}${FIRSTTAG}${PLAIN}${BOLD}/${FILENAME}.pdf${PLAIN}."
|
inform "PDF will be written to ^b${DIR}/${GREEN}${BOLD}${FIRSTTAG}^p^b/${FILENAME}.pdf."
|
||||||
echo -e "Symlinks will be created in:"
|
inform "Symlinks will be created in:"
|
||||||
for t in $TAGS; do
|
for t in $TAGS; do
|
||||||
if [[ $t != $FIRSTTAG ]]; then
|
if [[ $t != $FIRSTTAG ]]; then
|
||||||
echo -e " - ${DIR}/${GREEN}${t}${PLAIN}/${FILENAME}.pdf${PLAIN}"
|
# Show the symlink path as <dir>/<tag>/<filename>.pdf (the '/' after the tag is part of the path).
inform " - ${DIR}/${GREEN}${t}^p/${FILENAME}.pdf"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
exit 0
|
exit 0
|
||||||
|
@ -293,24 +291,22 @@ if [[ $TEMP -eq 0 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [[ $FILENAME == */* ]]; then
|
if [[ $FILENAME == */* ]]; then
|
||||||
error "Filename ${BOLD}$FILENAME${PLAIN}${RED} contains a slash - did you mix up filename and tags?"
|
error "Filename ^b$FILENAME^p contains a slash - did you mix up filename and tags?"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $FILENAME == *,* ]]; then
|
if [[ $FILENAME == *,* ]]; then
|
||||||
error "Filename ${BOLD}$FILENAME${PLAIN}${RED} is illegal - commas not allowed."
|
error "Filename ^b$FILENAME^p is illegal - commas not allowed."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# do the scan - capture output
|
# do the scan - capture output
|
||||||
#${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $*
|
#${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $*
|
||||||
|
|
||||||
#exec 5>&1
|
#exec 5>&1
|
||||||
#OUTPUT=$( ${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $* 2>&1 |tee /dev/fd/5; exit ${PIPESTATUS[0]})
|
#OUTPUT=$( ${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $* 2>&1 |tee /dev/fd/5; exit ${PIPESTATUS[0]})
|
||||||
#rv=$?
|
#rv=$?
|
||||||
|
|
||||||
|
|
||||||
finished=0
|
finished=0
|
||||||
NUMPAGES=0
|
NUMPAGES=0
|
||||||
TEMPFILE=`mktemp /tmp/scan.XXXXXX`
|
TEMPFILE=`mktemp /tmp/scan.XXXXXX`
|
||||||
|
@ -327,21 +323,20 @@ while [[ $finished -eq 0 ]]; do
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# scan new file
|
# scan new file
|
||||||
|
|
||||||
if [[ $VERBOSE -eq 1 ]]; then
|
if [[ $VERBOSE -eq 1 ]]; then
|
||||||
info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
|
info "will run: ${SCANLINE} -verbose -dir \"${DIR}\" -name \"${FILENAME}\" \"$DUPLEXOPTS\" \"$SCANNEROPTS\" $TAGS 2>&1"
|
||||||
fi
|
fi
|
||||||
echo -e -n "${CYAN}Scanning..."
|
notify "Scanning..."
|
||||||
OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
|
OUTPUT=$( ${SCANLINE} -verbose -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $TAGS 2>&1)
|
||||||
rv=$?
|
rv=$?
|
||||||
|
|
||||||
if [ $rv -eq 0 ]; then
|
if [ $rv -eq 0 ]; then
|
||||||
cecho $GREEN "done"
|
ok "done"
|
||||||
else
|
else
|
||||||
cecho $RED "failed"
|
fail
|
||||||
echo ""
|
echo ""
|
||||||
echo -e "${BOLD}${UNDERLINE}Full output:${PLAIN}"
|
csecho "$RED" "^bFull output:^p"
|
||||||
echo "$OUTPUT" | sed -e 's/^/ /'
|
csecho -e "$RED" "$OUTPUT" | sed -e 's/^/ /'
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
thisnpages=`printf %d $( echo "$OUTPUT" | egrep '^(Scan complete|didScanTo)' | wc -l )`
|
thisnpages=`printf %d $( echo "$OUTPUT" | egrep '^(Scan complete|didScanTo)' | wc -l )`
|
||||||
|
@ -358,7 +353,7 @@ while [[ $finished -eq 0 ]]; do
|
||||||
else
|
else
|
||||||
# multi-line mode
|
# multi-line mode
|
||||||
|
|
||||||
cecho $GREEN "Scanned + $thisnpages page(s) -> $NUMPAGES total."
|
csecho "$GREEN" "Scanned +^b$thisnpages^p page(s) -> ^b$NUMPAGES^p total."
|
||||||
if [[ $gotexisting -eq 1 ]]; then
|
if [[ $gotexisting -eq 1 ]]; then
|
||||||
# append newly-scanned page file on to rest of the pdf.
|
# append newly-scanned page file on to rest of the pdf.
|
||||||
pdftk ${TEMPFILE} ${PDFFILE} cat output ${TEMPFILE2}
|
pdftk ${TEMPFILE} ${PDFFILE} cat output ${TEMPFILE2}
|
||||||
|
@ -367,8 +362,7 @@ while [[ $finished -eq 0 ]]; do
|
||||||
rm -rf ${TEMPFILE}
|
rm -rf ${TEMPFILE}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -en "Insert next pages and press ${BOLD}ENTER${PLAIN}, or type '${BOLD}n${PLAIN}':${PLAIN}"
|
ask "Insert next pages and press ^bENTER^p, or type '^bn^p':" "y" yn
|
||||||
read -p " " yn
|
|
||||||
if [[ $yn == "n" ]]; then
|
if [[ $yn == "n" ]]; then
|
||||||
finished=1
|
finished=1
|
||||||
fi
|
fi
|
||||||
|
@ -391,7 +385,7 @@ if [[ $curmonth -ge 7 ]]; then
|
||||||
for t in $FINYEARTAGS; do
|
for t in $FINYEARTAGS; do
|
||||||
if [[ $x == */${t}/* ]]; then
|
if [[ $x == */${t}/* ]]; then
|
||||||
newname=`echo "$x" | sed -e "s,/${t}/$curyear/,/${t}/$nextyear/,"`
|
newname=`echo "$x" | sed -e "s,/${t}/$curyear/,/${t}/$nextyear/,"`
|
||||||
action "* Adjusting path for financial year: $x -> ${BOLD}$newname${PLAIN}"
|
inform "Adjusting path for financial year: $x -> ^b$newname^p"
|
||||||
mv -f "$x" "$newname"
|
mv -f "$x" "$newname"
|
||||||
PDFFILE=$(echo "$PDFFILE" | sed -e "s,${x},${newname},")
|
PDFFILE=$(echo "$PDFFILE" | sed -e "s,${x},${newname},")
|
||||||
fi
|
fi
|
||||||
|
@ -401,28 +395,45 @@ fi
|
||||||
|
|
||||||
PREVIEWFILE=""
|
PREVIEWFILE=""
|
||||||
if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
|
if [[ $PDFFILE == *" "* || $PDFFILE == *$'\n'* ]]; then
|
||||||
echo -e -n "${GREEN}Scanned $NUMPAGES page(s) to "
|
itext="Scanned $NUMPAGES page(s) to "
|
||||||
count=1
|
count=1
|
||||||
for x in $PDFFILE; do
|
for x in $PDFFILE; do
|
||||||
if [[ $count -eq 1 ]]; then
|
if [[ $count -eq 1 ]]; then
|
||||||
PREVIEWFILE="$x"
|
PREVIEWFILE="$x"
|
||||||
echo -e -n "${BOLD}${GREEN}${x}${PLAIN}"
|
itext="${itext}^b${x}^p"
|
||||||
else
|
else
|
||||||
echo -e -n "${GREEN} + ${BOLD}${x}${PLAIN}"
|
itext="${itext} + ^b${x}^p"
|
||||||
fi
|
fi
|
||||||
count=$((count + 1))
|
count=$((count + 1))
|
||||||
done
|
done
|
||||||
echo -e "${PLAIN}"
|
inform "$itext"
|
||||||
else
|
else
|
||||||
PREVIEWFILE="$PDFFILE"
|
PREVIEWFILE="$PDFFILE"
|
||||||
cecho $GREEN "Scanned $NUMPAGES page(s) to ${BOLD}${PDFFILE}${PLAIN}"
|
inform "Scanned $NUMPAGES page(s) to ^b${PDFFILE}^p"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $OCR -eq 1 ]]; then
|
||||||
|
OCROPTS="--output-type pdfa -r -d -q --author scan.sh --keywords '$TAGS'"
|
||||||
|
|
||||||
|
$OCRPROG $OCROPTS "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
|
||||||
|
if [[ $? -eq 0 ]]; then
|
||||||
|
# P2TPROG may be a bare command name, so resolve it via PATH with 'command -v'
# rather than testing for an executable file of that name in the current directory.
if command -v "$P2TPROG" >/dev/null 2>&1; then
|
||||||
|
# Count words (wc -w), not bytes, so the number matches the "word(s)" wording of the report.
words=$($P2TPROG "${PREVIEWFILE}" /dev/stdout | wc -w)
|
||||||
|
[[ $words -eq 1 ]] && ess="" || ess="s"
|
||||||
|
# Report the count held in $words (the variable assigned from the pdftotext output).
inform "OCR complete. Found ^b${words}^p word${ess}"
|
||||||
|
else
|
||||||
|
inform "OCR complete. Install pdftotext for word counts."
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
error "OCR failed"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Put the full path onto the copy buffer
|
# Put the full path onto the copy buffer
|
||||||
echo -n "${PREVIEWFILE}" | pbcopy
|
echo -n "${PREVIEWFILE}" | pbcopy
|
||||||
|
|
||||||
if [ $PREVIEW -eq 1 ]; then
|
if [ $PREVIEW -eq 1 ]; then
|
||||||
action "Showing preview..."
|
inform "Showing preview..."
|
||||||
open "${PREVIEWFILE}"
|
open "${PREVIEWFILE}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue