#!/bin/bash
#
# Scan a document to PDF with scanline, file it under one or more tag
# directories, and optionally OCR and/or preview the result.
#
# Depends on the "bashtools" helper library, which is expected to set
# HAVE_BASHTOOLS when sourced. The output helpers used throughout this
# script (inform, info, warn, error, notify, ok, fail, csecho, ask) are not
# defined in this file - presumably bashtools provides them.

# Location of the scanline command-line scanner.
SCANLINE=/usr/local/bin/scanline

# Quoted so that a $HOME containing whitespace cannot split the path.
. "${HOME}/.bashtools/bashtools.sh"
if [[ -z "$HAVE_BASHTOOLS" ]]; then
  echo "ERROR: bashtools not installed, download from https://git.nethack.net/rob/bashtools" >&2
  exit 1
fi

# Infer tags from a filename using the auto-tag rules loaded from the rc file.
#
# Globals:
#   NAUTOTAGS (read)  - number of auto-tag rules (treated as 0 when unset)
#   LOOKFOR   (read)  - array; text to look for in the filename
#   ADDTAG    (read)  - array; tag to add when the LOOKFOR entry at the same
#                       index is found
#   TAGS      (write) - space-separated tag list; matching tags are appended
# Arguments:
#   $1 - filename to inspect
function autotags() {
  local file idx pattern tag
  file="$1"

  for ((idx = 0; idx < ${NAUTOTAGS:-0}; idx++)); do
    pattern="${LOOKFOR[$idx]}"
    tag="${ADDTAG[$idx]}"

    # An incomplete rule would either match every filename (empty pattern)
    # or append an empty tag, so skip it.
    if [[ -z "$pattern" || -z "$tag" ]]; then
      continue
    fi

    # $pattern is deliberately unquoted so glob characters in a rule keep
    # working as they did before.
    if [[ "$file" == *${pattern}* ]]; then
      TAGS="${TAGS:+${TAGS} }${tag}"
      inform "Inferred tag '^b${tag}^p' from filename."
    fi
  done
}

# Make sure a locally-configured mountpoint is mounted.
#
# Arguments:
#   $1 - mountpoint directory
# Returns:
#   0 if the directory is (or ends up) mounted, 1 otherwise.
#
# The mountpoint is read from $1, matching both this header and the caller
# (which passes a single argument); it used to be read from $2 and was
# therefore always empty.
function mount_local() { # $1=mountpoint
  local mydir created
  mydir="$1"
  created=0

  if [[ -z "$mydir" ]]; then
    error "mount_local: no mountpoint given."
    return 1
  fi

  # try an ls in case we have automount set up
  ls "${mydir}/" >/dev/null 2>&1
  if is_mounted "$mydir"; then
    return 0
  fi

  # otherwise, make sure it exists first
  if [[ ! -d "$mydir" ]]; then
    sudo mkdir "$mydir"
    sudo chown "${USER}:staff" "$mydir"
    created=1
  fi
  # Mounting by mountpoint only - presumably relies on an fstab entry
  # (TODO confirm).
  mount "$mydir"

  # check again...
  if ! is_mounted "$mydir"; then
    error "$mydir could not be mounted."
    # Only tidy up a directory this call created; leave pre-existing
    # mountpoints alone.
    if [[ $created -eq 1 ]]; then
      sudo rmdir "$mydir"
    fi
    return 1
  fi
  return 0
}

# Mount an SMB share on a mountpoint using mount_smbfs.
#
# Arguments:
#   $1 - share specification handed to mount_smbfs
#   $2 - mountpoint directory
# Globals:
#   CYAN, PLAIN (read) - colour escape strings; not defined in this file
#                        (presumably from bashtools)
# Returns:
#   0 if the mountpoint is mounted afterwards, 1 otherwise.
function mount_samba() { # $1=share $2=mountpoint
  local myshare mydir created
  myshare="$1"
  mydir="$2"
  created=0

  # Create the mountpoint only when it is missing, so an existing directory
  # does not produce a mkdir error.
  if [[ ! -d "$mydir" ]]; then
    sudo mkdir "$mydir"
    created=1
  fi
  sudo chown "${USER}:staff" "$mydir"

  # Make password prompt be cyan
  printf '%b' "$CYAN"
  mount_smbfs "$myshare" "$mydir"
  printf '%b' "$PLAIN"

  # check again...
  if ! is_mounted "$mydir"; then
    error "$mydir could not be mounted."
    # Only remove a directory this call created.
    if [[ $created -eq 1 ]]; then
      sudo rmdir "$mydir"
    fi
    return 1
  fi
  return 0
}

# Report whether a directory is a mountpoint.
#
# A directory is treated as mounted when it lives on a different device from
# its parent, i.e. the device number of "dir/." differs from "dir/..".
#
# Arguments:
#   $1 - directory to test
# Returns:
#   0 if mounted; 1 if not mounted, missing, or not a directory.
function is_mounted() {
  local dir a b probe int_re
  local -a fmt
  dir="$1"

  if [[ -z "$dir" || ! -d "$dir" ]]; then
    return 1
  fi

  # BSD/macOS stat prints the device number with "-f %d". Other stat
  # implementations read "-f" differently, so probe first and fall back to
  # the GNU spelling "-c %d".
  int_re='^-?[0-9]+$'
  probe=$(stat -f %d / 2>/dev/null)
  if [[ "$probe" =~ $int_re ]]; then
    fmt=(-f %d)
  else
    fmt=(-c %d)
  fi

  a=$(stat "${fmt[@]}" "${dir}/." 2>/dev/null) || return 1
  b=$(stat "${fmt[@]}" "${dir}/.." 2>/dev/null) || return 1
  if [[ "$a" == "$b" ]]; then
    return 1
  fi
  return 0
}

# Print the command-line help text to stdout.
#
# Lists every option processarg() acts on, including -t (test mode), which
# was previously undocumented.
function usage() {
  printf '%s\n' \
    "usage: $0 [OPTIONS] filename tag1 [tag2] [tag3] ... [tagX]" \
    "" \
    ' 1. Mounts $PDFSHARE to $PDFDIR' \
    ' 2. Scans to: $PDFDIR/tag1/<year>/filename' \
    ' 3. Creates symlinks in:' \
    ' $PDFDIR/tag2/<year>/filename' \
    ' $PDFDIR/tag3/<year>/filename' \
    ' ...etc...' \
    "" \
    ' OPTIONS' \
    ' -d scan in duplex mode' \
    ' -f xxx for given tag, use financial year in path rather than calendar year' \
    ' -h show this text' \
    ' -l list all available scanners' \
    ' -m multi-page mode (prompts to load new pages each time)' \
    ' -M mount $PDFSHARE to $PDFDIR then exit' \
    ' -o use ocrmypdf to straighten document, extract text into pdf and use tags as keyword metadata' \
    ' -O xxx specify path to ocrmypdf' \
    ' -p preview document after scanning' \
    ' -s xxx select scanner to use' \
    ' -t test mode - show tags and target paths, then exit without scanning' \
    ' -T Temporary mode - scan to /tmp/a.pdf' \
    ' -v verbose mode' \
    ""
}

# ---- Default settings (may be overridden by the rc file / command line) ----

# OCR binary: built-in default, and the one actually used (-O overrides).
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
OCRPROG="$OCRPROG_DEF"
# pdftotext binary used for the post-OCR word statistics.
P2TPROG="/opt/homebrew/bin/pdftotext"
DUPLEXOPTS=""
MODE="scan"
# Destination directory. A tilde inside quotes is never expanded by the
# shell, so the fallback is spelled with $HOME rather than "~".
DIR="${PDFDIR:-${HOME}/Documents}"
SHARE="${PDFSHARE:-//user@smbhost.example.net:/pdfs}"
SCANNEROPTS=""
MULTIPAGE=0
OCR=0
PREVIEW=0
RCFILE="${HOME}/.scanrc"
TAGS=""
NAUTOTAGS=0
TEST=0
# Temporary-mode flag (-T); initialised here so later numeric tests never see
# an unset value.
TEMP=0
FINYEARTAGS=""
VERBOSE=0

# Options collected from the rc file, as one space-separated string.
ALLARGS=""

# Read $RCFILE (if it exists), skipping blank lines and lines starting with #.
#
#  - Lines starting with "auto ", "at " or "autotag " define an auto-tag rule:
#    word 2 is stored in LOOKFOR[] and word 3 in ADDTAG[], and NAUTOTAGS is
#    incremented.
#  - Every other line is prepended to ALLARGS (so later lines come first).
#
# NOTE(review): the original guard also tested ALLARGS for "-h", but ALLARGS
# is always empty at this point so that test never had any effect. It
# presumably meant the command line - confirm before reinstating.
function load_rcfile() {
  local line keyword look tag rest

  [[ -e "$RCFILE" ]] || return 0
  : "${NAUTOTAGS:=0}"

  notify "processing $RCFILE"

  while read -r line; do
    case "$line" in
      "auto "* | "at "* | "autotag "*)
        # Whitespace-split the rule without forking awk for each field.
        read -r keyword look tag rest <<<"$line"
        LOOKFOR[$NAUTOTAGS]="$look"
        ADDTAG[$NAUTOTAGS]="$tag"
        NAUTOTAGS=$((NAUTOTAGS + 1))
        ;;
      *)
        ALLARGS="$line $ALLARGS"
        ;;
    esac
  done < <(grep -Ev '(^#|^$)' -- "$RCFILE")
  ok
}

load_rcfile

# Act on a single parsed option.
#
# Arguments:
#   $1 - option letter as returned by getopts
#   $2 - option argument (only meaningful for -f, -O and -s)
# Globals written (depending on the option):
#   VERBOSE, DUPLEXOPTS, MULTIPAGE, OCR, OCRPROG, PREVIEW, SCANNEROPTS,
#   TEMP, TEST, FINYEARTAGS
# Exits the script for:
#   -h (after usage, status 1), -l (after listing scanners, status 1),
#   -M (0 on successful mount, 1 on failure), and any unknown option (1).
function processarg() {
  local i=$1
  local arg=$2

  case "$i" in
    v)
      VERBOSE=1
      info "verbose mode"
      ;;
    d)
      DUPLEXOPTS="-duplex"
      info "duplex mode"
      ;;
    h)
      usage
      exit 1
      ;;
    l)
      "$SCANLINE" -list
      exit 1
      ;;
    m)
      MULTIPAGE=1
      info "multi-page mode enabled"
      ;;
    M)
      inform "${CYAN}Manually mounting ^b${DIR}^p, enter password if prompted"
      if ! mount_samba "${SHARE}" "${DIR}"; then
        error "Mount failed"
        exit 1
      fi
      inform "Mount complete"
      exit 0
      ;;
    o)
      OCR=1
      info "OCR mode enabled"
      ;;
    O)
      OCRPROG="$arg"
      ;;
    p)
      PREVIEW=1
      info "preview enabled"
      ;;
    s)
      # Stored as one string with the name in literal double quotes; the
      # scan step is responsible for turning this into separate arguments.
      SCANNEROPTS="-scanner \"$arg\""
      ;;
    T)
      info "temp mode"
      TEMP=1
      ;;
    t)
      info "test mode"
      TEST=1
      ;;
    f)
      info "tag ^b$arg^p will use financial year paths"
      FINYEARTAGS="$FINYEARTAGS $arg"
      ;;
    *)
      # Also reached for getopts' own "?" result on an unrecognised flag.
      error "invalid argument: $i"
      usage
      exit 1
      ;;
  esac
}

# Option string for getopts. "O:" is included so that the documented
# -O <path-to-ocrmypdf> option is actually accepted.
validargs="cdf:hlmMoO:ps:Ttv"

# Pass 1: options collected from the rc file. ALLARGS is a flat string, so it
# is split on whitespace into an array (no pathname expansion).
#
# This pass is skipped when there are no rc options: getopts with no explicit
# arguments falls back to the positional parameters, which would make every
# command-line option be processed twice.
read -r -a rcargs <<<"$ALLARGS"
if [[ ${#rcargs[@]} -gt 0 ]]; then
  OPTIND=1
  while getopts "$validargs" i "${rcargs[@]}"; do
    processarg "$i" "$OPTARG"
  done
fi

# Pass 2: the real command line (processed last, so it wins over the rc file).
OPTIND=1
while getopts "$validargs" i "$@"; do
  processarg "$i" "$OPTARG"
done
shift $((OPTIND - 1))

# If OCR was requested, make sure the OCR program is executable.
#
# When it is not, a warning is issued and OCR is switched off, so the script
# does not go on to run a program it has just reported as missing.
# (Previously the warning was printed but OCR stayed enabled.)
#
# Globals:
#   OCR (read/write), OCRPROG (read), OCRPROG_DEF (read)
function check_ocr_available() {
  local wtext

  [[ ${OCR:-0} -eq 1 ]] || return 0
  if [[ -x "$OCRPROG" ]]; then
    return 0
  fi

  wtext="OCR requested but ^b$OCRPROG^p not found."
  # Only suggest -O when the user has not already supplied their own path.
  if [[ "$OCRPROG" == "$OCRPROG_DEF" ]]; then
    wtext="${wtext} Use ^b-O^p to specify alternate binary."
  fi
  warn "$wtext"
  OCR=0
}

check_ocr_available

# Work out the output filename and tag list from the remaining positional
# arguments (or use fixed values in temporary mode).
if [[ ${TEMP:-0} -eq 1 ]]; then
  # Temporary mode: fixed destination, no tags required.
  DIR=/tmp
  FILENAME=a
else
  if [[ $# -lt 1 ]]; then
    usage
    exit 1
  fi

  # first arg is filename, rest are tags
  FILENAME="$1"
  shift 1

  autotags "$FILENAME" # determine tags from filename
  while [[ $# -ge 1 ]]; do
    inform "Got tag '^b$1^p' on command line."
    TAGS="${TAGS:+${TAGS} }$1"
    shift 1
  done
  if [[ -z "$TAGS" ]]; then
    error "No tags specified or inferred from filename."
    exit 1
  fi

  # This will be the directory which scanline writes the pdf to
  read -r FIRSTTAG _ <<<"$TAGS"

  # Remove duplicate tags, keeping first-seen order. Sorting here (as was done
  # previously) could move a different tag to the front of the list passed to
  # scanline, so FIRSTTAG would no longer be the first tag scanline sees.
  # NOTE(review): assumes scanline stores the file under the first tag it is
  # given - confirm against the scanline documentation.
  TAGS=$(awk '{ for (i = 1; i <= NF; i++) if (!seen[$i]++) printf "%s%s", (n++ ? " " : ""), $i }' <<<"$TAGS")

  if [[ $TEST -eq 1 ]]; then
    # Test mode: report what would happen, then stop before scanning.
    inform "Tags found: ${GREEN}${BOLD}${TAGS}^p"
    inform "PDF will be written to ^b${DIR}/${GREEN}${BOLD}${FIRSTTAG}^p^b/${FILENAME}.pdf."
    inform "Symlinks will be created in:"
    for t in $TAGS; do
      if [[ "$t" != "$FIRSTTAG" ]]; then
        # A "/" now separates the tag directory from the filename.
        inform " - ${DIR}/${GREEN}${t}^p/${FILENAME}.pdf"
      fi
    done
    exit 0
  fi
fi

# Validate the scan destination before scanning (skipped in temporary mode).
#
#  - If DIR looks like a mounted share (its path contains "mnt" or "pdf"),
#    make sure it is mounted, trying mount_local if it is not.
#  - Require that DIR/FIRSTTAG exists.
#  - Reject filenames containing "/" or ",".
#
# Exits the script with status 1 on any failure.
# Globals read: TEMP, DIR, FIRSTTAG, FILENAME
function check_destination() {
  [[ ${TEMP:-0} -eq 0 ]] || return 0

  # Check that target pdfs share is mounted
  if [[ $DIR =~ mnt|pdf ]]; then
    # The result of this df was never used; it is kept (without the dead
    # grep) presumably to nudge an automounter - TODO confirm or remove.
    df -h "$DIR" >/dev/null 2>&1
    if ! is_mounted "$DIR"; then
      info "$DIR not mounted - trying to mount it..."
      #mount_samba "${SHARE}" "${DIR}"
      if ! mount_local "${DIR}"; then
        exit 1
      fi
    fi
  fi

  # Check that we didn't mix up the filename and tags
  # (quoted so a DIR containing whitespace is tested as a single path)
  if [[ ! -d "${DIR}/${FIRSTTAG}" ]]; then
    error "$DIR/$FIRSTTAG doesn't exist, did you mix up filename and tags?"
    exit 1
  fi
  if [[ "$FILENAME" == */* ]]; then
    error "Filename ^b$FILENAME^p contains a slash - did you mix up filename and tags?"
    exit 1
  fi

  if [[ "$FILENAME" == *,* ]]; then
    error "Filename ^b$FILENAME^p is illegal - commas not allowed."
    exit 1
  fi
}

check_destination

# do the scan - capture output
#
# Runs scanline once (or repeatedly in multi-page mode), counting pages and
# extracting the resulting path(s) from its verbose output into PDFFILE.
# In multi-page mode the pages scanned so far are parked in TEMPFILE while the
# next batch is scanned, then both are joined with pdftk.

finished=0
NUMPAGES=0
TEMPFILE=$(mktemp /tmp/scan.XXXXXX) || exit 1
TEMPFILE2=$(mktemp /tmp/scan.XXXXXX) || exit 1
PDFFILE=""

# Build the scanline argument list as an array so that unset options are
# omitted altogether (instead of being passed as empty-string arguments) and
# the scanner name reaches scanline as its own word.
scanargs=(-verbose -dir "$DIR" -name "$FILENAME")
if [[ -n "$DUPLEXOPTS" ]]; then
  scanargs+=("$DUPLEXOPTS")
fi
if [[ -n "$SCANNEROPTS" ]]; then
  # SCANNEROPTS is stored as:  -scanner "NAME"  (with literal quotes).
  scanner_re='^-scanner "(.*)"$'
  if [[ "$SCANNEROPTS" =~ $scanner_re ]]; then
    scanargs+=(-scanner "${BASH_REMATCH[1]}")
  else
    scanargs+=("$SCANNEROPTS")
  fi
fi
# Tags are passed to scanline as separate words.
read -r -a tagargs <<<"$TAGS"
scanargs+=("${tagargs[@]}")

while [[ $finished -eq 0 ]]; do
  gotexisting=0
  if [[ $MULTIPAGE -eq 1 && -n "$PDFFILE" ]]; then
    # move already scanned pages out of the way
    # PDFFILE can hold several whitespace-separated paths; only the first is
    # moved. NOTE(review): assumes the first path is the real file and the
    # rest are links to it - confirm against scanline's output.
    read -r primary _ <<<"$PDFFILE"
    if mv "$primary" "$TEMPFILE" 2>/dev/null; then
      gotexisting=1
    fi
  fi

  # scan new file
  if [[ $VERBOSE -eq 1 ]]; then
    info "will run: ${SCANLINE} $(printf '%q ' "${scanargs[@]}")2>&1"
  fi
  notify "Scanning"
  OUTPUT=$("$SCANLINE" "${scanargs[@]}" 2>&1)
  rv=$?

  if [[ $rv -eq 0 ]]; then
    ok "done"
  else
    fail
    echo ""
    csecho "$RED" "^bFull output:^p"
    csecho -e "$RED" "$OUTPUT" | sed -e 's/^/ /'
    # Keep TEMPFILE if it holds earlier pages; otherwise tidy up.
    rm -f -- "$TEMPFILE2"
    if [[ $gotexisting -eq 1 ]]; then
      warn "Previously scanned pages were left in ^b$TEMPFILE^p"
    else
      rm -f -- "$TEMPFILE"
    fi
    exit 1
  fi

  # One matching line per scanned page.
  thisnpages=$(grep -Ec '^(Scan complete|didScanTo)' <<<"$OUTPUT")
  NUMPAGES=$((NUMPAGES + thisnpages))

  # Last word of each matching line is taken to be an output path.
  PDFFILE=$(grep -E '(^About to)|to:' <<<"$OUTPUT" | awk '{ print $NF }')
  if [[ $VERBOSE -eq 1 ]]; then
    echo "output is: [$OUTPUT]"
    echo "PDFFILE is: [$PDFFILE]"
  fi

  if [[ $MULTIPAGE -eq 0 ]]; then
    finished=1
  else
    # multi-page mode
    csecho "$GREEN" "Scanned +^b$thisnpages^p page(s) -> ^b$NUMPAGES^p total."
    if [[ $gotexisting -eq 1 ]]; then
      # append newly-scanned page file on to rest of the pdf.
      read -r primary _ <<<"$PDFFILE"
      if pdftk "$TEMPFILE" "$primary" cat output "$TEMPFILE2"; then
        mv "$TEMPFILE2" "$primary"
        # remove the parked copy of the earlier pages
        rm -f -- "$TEMPFILE"
      else
        # Do not overwrite or delete anything if the merge failed.
        error "Could not append pages with pdftk; earlier pages are in $TEMPFILE"
        rm -f -- "$TEMPFILE2"
        exit 1
      fi
    fi

    ask "Insert next pages and press ^bENTER^p, or type '^bn^p':" "y" yn
    if [[ "$yn" == "n" ]]; then
      finished=1
    fi
  fi
done

# Both scratch files are created by mktemp above, so remove whatever is left.
rm -f -- "$TEMPFILE" "$TEMPFILE2"

# Adjust year for financial year if required.
#
# From July onwards, any path in PDFFILE that lies under a tag listed in
# FINYEARTAGS is moved from .../<tag>/<year>/... to .../<tag>/<year+1>/...,
# and PDFFILE is updated to match.
#
# Arguments:
#   $1 - month number (default: current month)
#   $2 - calendar year (default: current year)
# Globals:
#   PDFFILE (read/write), FINYEARTAGS (read)
#
# NOTE(review): if other entries in PDFFILE are symlinks to a file moved
# here, they may need re-pointing - not handled; confirm.
function adjust_for_financial_year() {
  local month year following x t from to newname

  # 10# forces base 10 so "08" and "09" are not read as invalid octal.
  month=$((10#${1:-$(date +%m)}))
  year=${2:-$(date +%Y)}
  following=$((year + 1))

  [[ $month -ge 7 ]] || return 0

  for x in $PDFFILE; do
    for t in $FINYEARTAGS; do
      from="/${t}/${year}/"
      to="/${t}/${following}/"
      # Plain string matching/splicing: tag and path are never interpreted
      # as sed patterns.
      if [[ "$x" != *"$from"* ]]; then
        continue
      fi
      newname="${x%%"$from"*}${to}${x#*"$from"}"

      inform "Adjusting path for financial year: $x -> ^b$newname^p"
      # The next year's directory may not exist yet.
      mkdir -p "${newname%/*}"
      # Only record the new location if the move really happened.
      if mv -f "$x" "$newname"; then
        PDFFILE="${PDFFILE%%"$x"*}${newname}${PDFFILE#*"$x"}"
      fi
    done
  done
}

curyear=$(date +%Y)
curmonth=$((10#$(date +%m)))
nextyear=$((curyear + 1))
adjust_for_financial_year "$curmonth" "$curyear"

# Tell the user where the scan ended up and choose the file to preview.
#
# PDFFILE may contain one path or several separated by spaces/newlines. The
# first path becomes PREVIEWFILE; when there are several, all are listed.
#
# Globals:
#   PDFFILE, NUMPAGES (read); PREVIEWFILE (write)
function report_scan_result() {
  local itext x

  PREVIEWFILE=""
  if [[ "$PDFFILE" == *" "* || "$PDFFILE" == *$'\n'* ]]; then
    itext="Scanned ^b$NUMPAGES^p page(s) to "
    # Unquoted on purpose: split the list into individual paths.
    for x in $PDFFILE; do
      if [[ -z "$PREVIEWFILE" ]]; then
        PREVIEWFILE="$x"
        itext="${itext}^b${x}^p"
      else
        itext="${itext} + ^b${x}^p"
      fi
    done
    inform "$itext"
  else
    PREVIEWFILE="$PDFFILE"
    inform "Scanned $NUMPAGES page(s) to ^b${PDFFILE}^p"
  fi
}

report_scan_result

# Run OCR over PREVIEWFILE in place and report simple word statistics.
#
# Does nothing unless OCR is 1. The tag list is passed as a single
# --keywords value; building the options as an array keeps "tag1 tag2"
# together instead of splitting it into stray arguments with literal quotes.
#
# Globals read: OCR, OCRPROG, P2TPROG, TAGS, PREVIEWFILE
function run_ocr() {
  local allwords words mostfreq mostfreq_word mostfreq_count ess occ_ess
  local -a ocropts

  [[ ${OCR:-0} -eq 1 ]] || return 0

  notify "Running OCR"
  ocropts=(--output-type pdfa -r -d -q --author scan.sh --keywords "$TAGS")

  # Input and output are the same file: the PDF is replaced by the OCRed one.
  if ! "$OCRPROG" "${ocropts[@]}" "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1; then
    fail
    return 0
  fi
  ok

  if [[ ! -x "$P2TPROG" ]]; then
    warn "pdftotext not installed - skipping word count"
    return 0
  fi

  allwords=$("$P2TPROG" "${PREVIEWFILE}" /dev/stdout)
  # Arithmetic expansion strips the padding some wc implementations add.
  words=$(($(wc -w <<<"$allwords")))

  # One word per line, letters only, must contain a lowercase letter; then
  # pick the most frequent.
  mostfreq=$(printf '%s\n' "$allwords" |
    tr ' ' '\n' 2>/dev/null |
    tr -dc 'A-Za-z\n' 2>/dev/null |
    awk NF |
    grep '[a-z]' |
    sort | uniq -c | sort -n | tail -1)
  read -r mostfreq_count mostfreq_word <<<"$mostfreq"

  if [[ $words -eq 0 || -z "$mostfreq_word" ]]; then
    # Nothing usable to rank.
    [[ $words -eq 1 ]] && ess="" || ess="s"
    inform "Found ^b${words}^p word${ess}."
    return 0
  fi

  [[ $words -eq 1 ]] && ess="" || ess="s"
  [[ $mostfreq_count -eq 1 ]] && occ_ess="" || occ_ess="s"
  inform "Found ^b${words}^p word${ess}, most frequent was ^b${mostfreq_word}^p with ^b${mostfreq_count}^p occurrence${occ_ess}."
  # The extracted text is no longer dumped to the fixed path /tmp/a: a
  # predictable, shared temp file is unsafe and exposes document contents.
}

run_ocr

# Put the full path onto the copy buffer
# (skipped quietly when pbcopy is not available; printf is used so the path
# is copied verbatim with no trailing newline)
if command -v pbcopy >/dev/null 2>&1; then
  printf '%s' "${PREVIEWFILE}" | pbcopy
fi

# Open the result if preview (-p) was requested. ${PREVIEW:-0} keeps the
# numeric test valid even if PREVIEW was never set.
if [[ ${PREVIEW:-0} -eq 1 ]]; then
  inform "Showing preview..."
  open "${PREVIEWFILE}"
fi