scan/scan

#!/bin/bash

# Path to the scanline binary that performs the actual scan.
SCANLINE=/usr/local/bin/scanline

# Load the bashtools helper library.  The helpers used throughout this script
# but not defined in it (inform, info, warn, error, notify, ok, fail, csecho,
# ask) are presumably provided by it - confirm against bashtools.
# Only source the file when it is readable, so a missing install produces the
# friendly message below instead of a raw shell error.
if [[ -r "${HOME}/.bashtools/bashtools.sh" ]]; then
    . "${HOME}/.bashtools/bashtools.sh"
fi
if [[ -z $HAVE_BASHTOOLS ]]; then
    echo "ERROR: bashtools not installed, download from https://git.nethack.net/rob/bashtools" >&2
    exit 1
fi

# Infer tags from a filename using the auto-tag rules.
#   $1 = filename to inspect
# Globals read:    NAUTOTAGS (rule count), LOOKFOR[] (patterns), ADDTAG[] (tags)
# Globals written: TAGS (space-separated list; matching tags are appended)
# Each rule whose LOOKFOR entry occurs anywhere in the filename contributes its
# ADDTAG entry and is reported via inform.
function autotags() {
    local candidate="$1"
    local i

    for (( i = 0; i < NAUTOTAGS; i++ )); do
        # LOOKFOR is deliberately left unquoted: it is matched as a glob fragment.
        [[ $candidate == *${LOOKFOR[$i]}* ]] || continue

        # Append, inserting a separating space only when TAGS already has content.
        TAGS="${TAGS:+$TAGS }${ADDTAG[$i]}"
        inform "* Inferred tag '^b${ADDTAG[$i]}^p' from filename."
    done
    return 0
}

# Ensure a locally-configured mountpoint is mounted.
#   $1 = mountpoint
# Returns 0 when the mountpoint is (or becomes) mounted, 1 otherwise.
# The mountpoint directory is created (via sudo) when missing, and removed
# again if the mount attempt fails.
function mount_local() { #  $1=mountpoint
    local mydir
    # The mountpoint is the first (and only) argument; reading $2 here left
    # mydir empty for the single-argument call made by this script.
    mydir=$1

    # try an ls in case we have automount set up
    ls "${mydir}/" >/dev/null 2>&1
    if is_mounted "$mydir"; then
        return 0
    fi

    # otherwise, make sure it exists first
    if [[ ! -d ${mydir} ]]; then
        sudo mkdir "${mydir}"
        sudo chown "$USER:staff" "${mydir}"
    fi
    # Single-argument mount: relies on the system already knowing what to
    # mount at this path.
    mount "${mydir}"

    # check again...
    if ! is_mounted "$mydir"; then
        error "$mydir could not be mounted."
        sudo rmdir "${mydir}"
        return 1
    fi
    return 0
}

# Mount an SMB share on a mountpoint using mount_smbfs.
#   $1 = share  (passed straight to mount_smbfs)
#   $2 = mountpoint
# Returns 0 when the mountpoint is mounted afterwards, 1 otherwise (in which
# case the mountpoint directory is removed again with rmdir).
function mount_samba() { #  $1=share  $2=mountpoint
    local myshare mydir
    myshare=$1
    mydir=$2

    # -p: do not complain when the mountpoint already exists (e.g. repeated -M).
    sudo mkdir -p "${mydir}"
    sudo chown "$USER:staff" "${mydir}"
    # Make password prompt be cyan
    echo -e -n "$CYAN"
    mount_smbfs "${myshare}" "${mydir}"
    echo -e -n "$PLAIN"

    # check again...
    if ! is_mounted "$mydir"; then
        error "$mydir could not be mounted."
        sudo rmdir "${mydir}"
        return 1
    fi
    return 0
}

# Report whether a directory is a mount point.
#   $1 = directory to test
# Returns 0 (mounted) when "stat -f %d" gives different output for the
# directory itself and for its parent, 1 otherwise.
# NOTE(review): "-f %d" is the BSD/macOS stat format flag (device id) -
# confirm before running this against GNU stat.
function is_mounted() {
    local mountpoint="$1"
    local dev_self dev_parent

    dev_self=$(stat -f %d "${mountpoint}/.")
    dev_parent=$(stat -f %d "${mountpoint}/..")

    # Differing values mean the directory lives on another device than its parent.
    [[ $dev_self != $dev_parent ]]
}

# Print the command-line help text to stdout.
# Takes no arguments; $0 is interpolated as the program name.  The literal
# strings $PDFSHARE / $PDFDIR are printed as-is (they name the environment
# variables the user can set), hence the escaped dollar signs.
function usage() {
    cat <<EOF
usage: $0 [OPTIONS] filename tag1 [tag2] [tag3] ... [tagX]

       1. Mounts \$PDFSHARE to \$PDFDIR
       2. Scans to:  \$PDFDIR/tag1/<year>/filename
       3. Creates symlinks in:
                  \$PDFDIR/tag2/<year>/filename
                  \$PDFDIR/tag3/<year>/filename
                  ...etc...

    OPTIONS
       -d      scan in duplex mode
       -f xxx  for given tag, use financial year in path rather than calendar year
       -h      show this text
       -l      list all available scanners
       -m      multi-page mode (prompts to load new pages each time)
       -M      mount \$PDFSHARE to \$PDFDIR then exit
       -o      use ocrmypdf to straighten document, extract text into pdf and use tags as keyword metadata
       -O xxx  specify path to ocrmypdf
       -p      preview document after scanning
       -s xxx  select scanner to use
       -t      test mode - show tags and target paths, then exit without scanning
       -T      Temporary mode - scan to /tmp/a.pdf
       -v      verbose mode

EOF
}

# ---------------------------------------------------------------------------
# Defaults.  Most of these can be overridden by options given on the command
# line or in the rc file.
# ---------------------------------------------------------------------------
OCRPROG_DEF="/opt/homebrew/bin/ocrmypdf"
OCRPROG="$OCRPROG_DEF"
P2TPROG="pdftotext"
DUPLEXOPTS=""
MODE="scan"
# Use $HOME rather than "~": a tilde inside quotes is never expanded, which
# left DIR pointing at a literal "~/Documents" directory.
DIR=${PDFDIR:-"${HOME}/Documents"}
SHARE=${PDFSHARE:-"//user@smbhost.example.net:/pdfs"}
SCANNEROPTS=""
MULTIPAGE=0
OCR=0
PREVIEW=0
RCFILE=${HOME}/.scanrc
TAGS=""
NAUTOTAGS=0
TEMP=0
TEST=0
FINYEARTAGS=""
VERBOSE=0

# ---------------------------------------------------------------------------
# Collect the effective argument list: rc-file options first, then the real
# command line.  An array keeps arguments containing whitespace intact.
# ---------------------------------------------------------------------------
ALLARGS=("$@")

# The rc file is skipped when help was requested.  Only words that look like
# an option cluster containing "h" count, so a filename such as "my-house"
# does not disable rc processing.
skiprc=0
for arg in "$@"; do
    if [[ $arg == -*h* ]]; then
        skiprc=1
    fi
done

if [[ -e $RCFILE && $skiprc -eq 0 ]]; then
    notify "processing $RCFILE"

    while read -r f ; do
        if [[ $f == "auto "* || $f == "at "* || $f == "autotag "* ]]; then
            # Auto-tag rule:  <keyword> <text to look for> <tag to add>
            read -r rckeyword thislookfor thisaddtag rcrest <<<"$f"
            LOOKFOR[$NAUTOTAGS]="$thislookfor"
            ADDTAG[$NAUTOTAGS]="$thisaddtag"
            NAUTOTAGS=$((NAUTOTAGS + 1))
        else
            # Anything else is treated as extra command-line options; split the
            # line into words and put them in front of what we have so far.
            read -r -a rcwords <<<"$f"
            ALLARGS=("${rcwords[@]}" "${ALLARGS[@]}")
        fi
    done < <(grep -Ev "(^#|^$)" "$RCFILE")
    ok
fi

# Make the merged list the positional parameters.  getopts and the final
# shift then operate on the same list; previously getopts parsed the merged
# list while shift acted on the original arguments, so every option coming
# from the rc file caused a real argument (e.g. the filename) to be dropped.
set -- "${ALLARGS[@]}"

# NOTE(review): "c" is accepted by the parser but has no handler below, so it
# is reported as an invalid argument - confirm whether it can be removed.
validargs="cdf:hlmMoO:ps:Ttv"
while getopts "$validargs" i; do
    case "$i" in
        v)
            VERBOSE=1;
            info "verbose mode"
            ;;
        d)
            DUPLEXOPTS="-duplex";
            info "duplex mode"
            ;;
        h)
            usage; exit 1;
            ;;
        l)
            ${SCANLINE} -list; exit 1;
            ;;
        m)
            MULTIPAGE=1;
            info "multi-page mode enabled"
            ;;
        M)
            inform "${CYAN}Manually mounting ^b${DIR}^p, enter password if prompted"
            mount_samba "${SHARE}" "${DIR}"
            if [ $? -ne 0 ]; then
                error "Mount failed"
                exit 1
            fi
            inform "Mount complete"
            exit 0
            ;;
        o)
            OCR=1;
            ;;
        O)
            OCRPROG="$OPTARG"
            ;;
        p)
            PREVIEW=1;
            info "preview enabled"
            ;;
        s)
            SCANNEROPTS="-scanner \"$OPTARG\"";
            ;;
        T)
            info "temp mode"
            TEMP=1;
            ;;
        t)
            info "test mode"
            TEST=1;
            ;;
        f)
            # getopts reports the option letter without its dash.
            info "tag ^b$OPTARG^p will use financial year paths"
            FINYEARTAGS="$FINYEARTAGS $OPTARG"
            ;;
        *)
            # NOTE(review): execution continues after reporting an invalid
            # option - confirm whether this should exit instead.
            error "invalid argument:  $i";
            usage;
            ;;
    esac
done
shift $((OPTIND - 1))

# When OCR was requested, verify the OCR program is actually runnable.
# "command -v" accepts both a full path and a bare command name found on PATH.
if [[ $OCR -eq 1 ]]; then
    if command -v "$OCRPROG" >/dev/null 2>&1; then
        info "OCR mode enabled"
    else
        wtext="OCR requested but ^b$OCRPROG^p not found."
        if [[ $OCRPROG == "$OCRPROG_DEF" ]]; then
            wtext="${wtext}  Use ^b-O^p to specify alternate binary."
        fi
        warn "$wtext"
        # Disable OCR so the later OCR step does not try to run a missing
        # program and report a second, misleading failure.
        OCR=0
    fi
fi

# Work out the target filename and the list of tags.
if [[ $TEMP -eq 1 ]]; then
    # Temporary mode: fixed directory and name, no tags.
    DIR=/tmp
    FILENAME=a
else
    if [ $# -lt 1 ]; then
        usage
        exit 1
    fi

    # first arg is filename, rest are tags
    FILENAME=$1
    shift 1

    autotags "$FILENAME" # determine tags from filename
    while [[ $# -ge 1 ]]; do
        inform "Got tag '^b$1^p' on command line."
        TAGS="${TAGS:+$TAGS }$1"
        shift 1
    done
    if [[ -z $TAGS ]]; then
        error "No tags specified or inferred from filename."
        exit 1
    fi

    # Remove duplicate tags while keeping their original order.  Sorting the
    # list here would change which tag comes first in the list handed to
    # scanline, so FIRSTTAG would no longer name the directory actually used.
    uniqtags=""
    for t in $TAGS; do
        case " $uniqtags " in
            *" $t "*) ;;                                  # already present
            *) uniqtags="${uniqtags:+$uniqtags }$t" ;;
        esac
    done
    TAGS="$uniqtags"

    # This will be the directory which scanline writes the pdf to
    FIRSTTAG=${TAGS%% *}

    if [[ $TEST -eq 1 ]]; then
        # Test mode: report what would happen, then stop before scanning.
        inform "Tags found:  ${GREEN}${BOLD}${TAGS}^p"
        inform "PDF will be written to ^b${DIR}/${GREEN}${BOLD}${FIRSTTAG}^p^b/${FILENAME}.pdf."
        inform "Symlinks will be created in:"
        for t in $TAGS; do
            if [[ $t != "$FIRSTTAG" ]]; then
                inform "   - ${DIR}/${GREEN}${t}^p/${FILENAME}.pdf"
            fi
        done
        exit 0
    fi
fi

# Unless scanning to the temporary location, make sure the target directory
# is available and that the filename/tag arguments look sane.
if [[ $TEMP -eq 0 ]]; then
    # Check that target pdfs share is mounted
    if [[ $DIR =~ mnt|pdf ]]; then
        # NOTE(review): the result of this df is never used; presumably it is
        # here to poke an automounter before the real check - confirm.
        df -h "${DIR}" 2>&1 | grep @ >/dev/null 2>&1
        if ! is_mounted "$DIR"; then
            info "$DIR not mounted - trying to mount it..."
            #mount_samba "${SHARE}" "${DIR}"
            if ! mount_local "${DIR}"; then
                exit 1
            fi
        fi
    fi

    # Check that we didn't mix up the filename and tags
    # (quoted so that directories containing whitespace are tested correctly)
    if [[ ! -d "${DIR}/${FIRSTTAG}" ]]; then
        error "$DIR/$FIRSTTAG doesn't exist, did you mix up filename and tags?"
        exit 1
    fi
    if [[ $FILENAME == */* ]]; then
        error "Filename ^b$FILENAME^p contains a slash - did you mix up filename and tags?"
        exit 1
    fi

    if [[ $FILENAME == *,* ]]; then
        error "Filename ^b$FILENAME^p is illegal - commas not allowed."
        exit 1
    fi
fi

# do the scan  - capture output
#${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $*
#exec 5>&1
#OUTPUT=$( ${SCANLINE} -dir "${DIR}" -name "${FILENAME}" "$DUPLEXOPTS" "$SCANNEROPTS" $* 2>&1 |tee /dev/fd/5; exit ${PIPESTATUS[0]})
#rv=$?

# ---------------------------------------------------------------------------
# Scan loop.  Runs scanline once, or repeatedly in multi-page mode, where each
# new batch of pages is appended to the pages scanned so far using pdftk.
# ---------------------------------------------------------------------------
finished=0
NUMPAGES=0
TEMPFILE=$(mktemp /tmp/scan.XXXXXX)
TEMPFILE2=$(mktemp /tmp/scan.XXXXXX)
PDFFILE=""

# Build scanline's argument list as an array so that:
#   - unset options add no argument at all (a quoted empty variable would be
#     passed on as an empty-string argument), and
#   - the scanner name becomes its own argument instead of being glued to
#     "-scanner" together with literal quote characters.
SCANARGS=(-verbose -dir "${DIR}" -name "${FILENAME}")
if [[ -n $DUPLEXOPTS ]]; then
    SCANARGS+=("$DUPLEXOPTS")
fi
if [[ $SCANNEROPTS == '-scanner "'*'"' ]]; then
    # SCANNEROPTS is stored as:  -scanner "<name>"  - recover the bare name.
    scannername=${SCANNEROPTS#'-scanner "'}
    scannername=${scannername%'"'}
    SCANARGS+=(-scanner "$scannername")
elif [[ -n $SCANNEROPTS ]]; then
    SCANARGS+=("$SCANNEROPTS")
fi
# TAGS is a space-separated list; split it into one argument per tag.
SCANARGS+=($TAGS)

while [[ $finished -eq 0 ]]; do
    gotexisting=0
    if [[ $MULTIPAGE -eq 1 ]]; then
        if ! [[ -z $PDFFILE ]]; then
            # move already scanned pages out of the way
            mv ${PDFFILE} ${TEMPFILE} 2>/dev/null
            gotexisting=1
        fi
    fi

    # scan new file
    if [[ $VERBOSE -eq 1 ]]; then
        info "will run:  ${SCANLINE} ${SCANARGS[*]} 2>&1"
    fi
    notify "Scanning..."
    OUTPUT=$( ${SCANLINE} "${SCANARGS[@]}" 2>&1)
    rv=$?

    if [ $rv -eq 0 ]; then
        ok "done"
    else
        fail
        echo ""
        csecho "$RED" "^bFull output:^p"
        csecho -e "$RED" "$OUTPUT" | sed -e 's/^/   /'
        exit 1
    fi
    # Count scanned pages from scanline's progress lines.
    thisnpages=$(echo "$OUTPUT" | grep -Ec '^(Scan complete|didScanTo)')
    NUMPAGES=$(( NUMPAGES + thisnpages ))

    # The last word of each matching output line is taken as a written path.
    PDFFILE=$(echo "$OUTPUT" | grep -E '(^About to)|to:' | awk '{ print $NF }')
    if [[ $VERBOSE -eq 1 ]]; then
        echo "output is: [$OUTPUT]"
        echo "PDFFILE is: [$PDFFILE]"
    fi

    if [[ $MULTIPAGE -eq 0 ]]; then
        finished=1
    else
        # multi-page mode

        csecho "$GREEN" "Scanned +^b$thisnpages^p page(s) -> ^b$NUMPAGES^p total."
        if [[ $gotexisting -eq 1 ]]; then
            # append newly-scanned page file on to rest of the pdf.
            pdftk ${TEMPFILE} ${PDFFILE} cat output ${TEMPFILE2}
            mv ${TEMPFILE2} "${PDFFILE}"
            # remove the saved copy of the earlier pages
            rm -rf ${TEMPFILE}
        fi

        ask "Insert next pages and press ^bENTER^p, or type '^bn^p':" "y" yn
        if [[ $yn == "n" ]]; then
            finished=1
        fi
    fi
done

# Remove whatever is left of the scratch files created for the scan loop.
# Both are created up front by mktemp, and the second one is only consumed
# when pages are merged in multi-page mode, so it must be cleaned up as well.
for tmpfile in "$TEMPFILE" "$TEMPFILE2"; do
    if [[ -n $tmpfile && -e $tmpfile ]]; then
        rm -f -- "$tmpfile"
    fi
done

# Adjust year for financial year if required.
# From July onwards, files written under a tag listed in FINYEARTAGS are
# moved from .../<tag>/<current year>/ to .../<tag>/<next year>/.
curyear=$(date +%Y)
# Force base 10 so months with a leading zero ("08", "09") are not parsed as octal.
curmonth=$((10#$(date +%m)))
nextyear=$((curyear + 1))
if [[ $curmonth -ge 7 ]]; then
    for x in $PDFFILE; do
        for t in $FINYEARTAGS; do
            if [[ $x == */${t}/* ]]; then
                newname=$(printf '%s\n' "$x" | sed -e "s,/${t}/$curyear/,/${t}/$nextyear/,")
                if [[ $newname == "$x" ]]; then
                    # Path has no /<tag>/<current year>/ component: nothing to move.
                    continue
                fi
                inform "Adjusting path for financial year: $x -> ^b$newname^p"
                # The next year's directory may not exist yet.
                mkdir -p "$(dirname "$newname")"
                mv -f "$x" "$newname"
                # Literal (non-regex) replacement of the old path in the list.
                PDFFILE=${PDFFILE/"$x"/"$newname"}
            fi
        done
    done
fi

# Report where the scan ended up and choose the file used by the later steps.
# When PDFFILE holds several whitespace-separated paths, the first one becomes
# PREVIEWFILE and all of them are listed in the message, joined with " + ".
PREVIEWFILE=""
case "$PDFFILE" in
    *" "*|*$'\n'*)
        summary="Scanned $NUMPAGES page(s) to "
        joiner=""
        for pdfpath in $PDFFILE; do
            if [[ -z $joiner ]]; then
                # first path in the list
                PREVIEWFILE="$pdfpath"
            fi
            summary="${summary}${joiner}^b${pdfpath}^p"
            joiner=" + "
        done
        inform "$summary"
        ;;
    *)
        # Single path (or nothing at all): use it as-is.
        PREVIEWFILE="$PDFFILE"
        inform "Scanned $NUMPAGES page(s) to ^b${PDFFILE}^p"
        ;;
esac

# Optional OCR pass: rewrite the scanned PDF in place with ocrmypdf, storing
# the tags as keyword metadata, then report how many words were recognised.
if [[ $OCR -eq 1 ]]; then
    # Keep the options in an array so the whole tag list is passed as ONE
    # --keywords value.  Building them as a single string split the tags into
    # separate arguments and left literal quote characters in the metadata.
    OCROPTS=(--output-type pdfa -r -d -q --author scan.sh --keywords "$TAGS")

    "$OCRPROG" "${OCROPTS[@]}" "${PREVIEWFILE}" "${PREVIEWFILE}" >/dev/null 2>&1
    if [[ $? -eq 0 ]]; then
        # Look the text extractor up on PATH; a -x test on a bare command name
        # only checks the current directory.
        if command -v "$P2TPROG" >/dev/null 2>&1; then
            # "-" makes pdftotext write to stdout; count words, not characters.
            words=$("$P2TPROG" "${PREVIEWFILE}" - | wc -w | tr -d '[:space:]')
            if [[ $words -eq 1 ]]; then
                ess=""
            else
                ess="s"
            fi
            inform "OCR complete.  Found ^b${words}^p word${ess}"
        else
            inform "OCR complete.  Install pdftotext for word counts."
        fi
    else
        error "OCR failed"
    fi
fi

# Leave the path of the scanned file on the clipboard (no trailing newline).
echo -n "${PREVIEWFILE}" | pbcopy

# Open the result in the default viewer when preview (-p) was requested.
if (( PREVIEW == 1 )); then
    inform "Showing preview..."
    open "${PREVIEWFILE}"
fi