#!/usr/bin/env bash
#
# Fetch each configured web page, keep only the region between a start and an
# end regexp, and store it as <outdir>/<sitename>.txt so that changes can be
# detected and, optionally, committed/pushed with git.
#
# Config file format: one site per line, four fields separated by the field
# separator (default "@", see -F):
#
#   sitename@url@start_regexp@end_regexp
#
# Blank lines and lines starting with '#' are skipped. A site whose start or
# end regexp is empty is still listed by -l but is not scraped.
# NOTE(review): the original format comment was partly lost in this copy; the
# field layout above is taken from the parsing code below.
#

DEFAULT_SEP="@"
DEFAULT_OUTDIR="${HOME}/.tostracker/output"
DEFAULT_CONFIG="${HOME}/.tostracker/config"
FORMAT="%-32s %s\n"

# Private scratch directory that holds every temp file; removed by cleanup().
WORKDIR=""

# Parallel arrays describing the configured sites (same index = same site).
nsites=0
site_name=()
site_url=()
site_re_start=()
site_re_end=()

# Print the command line help to stdout.
function usage() {
  cat <<EOF
usage: $0 OPTIONS

OPTIONS:
  -c filename   Use given config file instead of default (${DEFAULT_CONFIG})
  -F char       Use given character as a field separator in config file instead of default (${DEFAULT_SEP})
  -gc           After site scrapes, run 'git add' on all files, then 'git commit'
  -gp           After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'
  -h            Show this help
  -l            List configured sites then exit
  -L            Use lynx to render html
  -o dirname    Use given output dir instead of default (${DEFAULT_OUTDIR})
  -t sitename   Just output raw content of given site, useful for finding start/end regexps.
  -T sitename   Just output content of given site between re_start and re_end regexps.


EOF
}

# Print "FATAL: <message>" to stderr and exit with status 1.
# Backslash escapes in the message (callers pass literal "\n") are expanded.
function fatal() {
  printf 'FATAL: %b\n' "$*" >&2
  exit 1
}

# Create the scratch directory once. Must be called from the main shell (not
# from a command substitution) so that WORKDIR is visible to cleanup().
function init_workdir() {
  [[ -n $WORKDIR ]] && return 0
  WORKDIR=$(mktemp -d /tmp/tostracker.XXXXXX) \
    || fatal "failed to create temp directory"
}

# Create an empty temp file inside WORKDIR and print its path on stdout.
# Exits with status 1 on failure; because callers use $(newtempfile), that
# only ends the subshell, so they must check the status themselves.
function newtempfile() {
  local fn
  if [[ -z $WORKDIR ]] || ! fn=$(mktemp "${WORKDIR}/XXXXXX"); then
    echo "failed to create temp file" >&2
    exit 1
  fi
  echo "$fn"
}

# EXIT handler: remove the scratch directory and everything in it.
function cleanup() {
  if [[ -n $WORKDIR && -d $WORKDIR ]]; then
    rm -rf -- "$WORKDIR"
  fi
}

# Register a site. Arguments: $1=name $2=url $3=start regexp $4=end regexp.
function addsite() {
  site_name[nsites]="$1"
  site_url[nsites]="$2"
  site_re_start[nsites]="$3"
  site_re_end[nsites]="$4"
  nsites=$((nsites + 1))
}

# Print the index of the first site whose name equals $1 exactly.
# Returns 1 (printing nothing) when the name is empty or unknown.
function findsite() {
  local lookfor="$1" x
  [[ -z $lookfor ]] && return 1
  for x in "${!site_name[@]}"; do
    if [[ ${site_name[$x]} == "$lookfor" ]]; then
      echo "$x"
      return 0
    fi
  done
  return 1
}

# Print the "name url" line for the site named $1; returns 1 if not found.
function showsite() {
  local idx
  idx=$(findsite "$1") || return 1
  showsite_byidx "$idx"
}

# Print the "name url" line for the site at index $1.
function showsite_byidx() {
  local idx="$1"
  # shellcheck disable=SC2059 -- FORMAT is a constant defined in this file.
  printf "$FORMAT" "${site_name[$idx]}" "${site_url[$idx]}"
}

# Download $1 and write a text rendering of it to file $2.
# Uses lynx when -L was given, otherwise a sed-based tag stripper.
# Returns non-zero if any stage of the download/render pipeline failed.
function getcontent() {
  local url="$1"
  local outfile="$2"
  local -a rc
  local s

  if [[ -n $LYNX ]]; then
    curl -sL "$url" | "$LYNX" -stdin -dump -list_inline >"$outfile"
    rc=("${PIPESTATUS[@]}")
  else
    # NOTE(review): the original sed program was truncated in this copy
    # (only 's/>/>\n/g;s/' survived). This reconstruction puts a newline
    # after every '>', strips tags and drops blank lines -- confirm it
    # against the upstream script. "\n" in the replacement needs GNU sed,
    # which is why gsed is preferred when available.
    curl -sL "$url" \
      | "$SED" 's/>/>\n/g' \
      | "$SED" -e 's/<[^>]*>//g' -e '/^[[:space:]]*$/d' >"$outfile"
    rc=("${PIPESTATUS[@]}")
  fi

  for s in "${rc[@]}"; do
    [[ $s -eq 0 ]] || return 1
  done
  return 0
}

trap cleanup EXIT
# Turn TERM into a normal exit so the EXIT trap runs and the script stops
# instead of resuming after the handler.
trap 'exit 143' TERM

# Defaults
OUTDIR="${DEFAULT_OUTDIR}"
CONFIG="${DEFAULT_CONFIG}"
SEP="${DEFAULT_SEP}"
DOGITCOMMIT=0
DOGITPUSH=0
MODE="normal"
TESTSITE=""
TEST_USERES=0
LYNX=""

ARGS="hc:F:g:lLo:t:T:"
while getopts "$ARGS" i; do
  case "$i" in
    h)
      usage
      exit 1
      ;;
    g)
      # -gc / -gp arrive as option 'g' with argument 'c' or 'p'.
      if [[ $OPTARG == "c" ]]; then
        DOGITCOMMIT=1
        DOGITPUSH=0
      elif [[ $OPTARG == "p" ]]; then
        DOGITCOMMIT=1
        DOGITPUSH=1
      else
        fatal "invalid git subargument - must use -gc or -gp"
      fi
      ;;
    l)
      MODE="list"
      ;;
    L)
      LYNX=$(command -v lynx 2>/dev/null)
      [[ -n $LYNX ]] || fatal "lynx not found in path"
      ;;
    o)
      OUTDIR="$OPTARG"
      ;;
    c)
      CONFIG="$OPTARG"
      ;;
    F)
      SEP="$OPTARG"
      ;;
    t)
      MODE="test"
      TESTSITE="$OPTARG"
      TEST_USERES=0
      ;;
    T)
      MODE="test"
      TESTSITE="$OPTARG"
      TEST_USERES=1
      ;;
    *)
      # getopts has already printed the specific error.
      usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

if [[ $# -ne 0 ]]; then
  usage
  exit 1
fi

# Prefer GNU sed installed as gsed (e.g. on macOS/BSD), else plain sed.
SED=$(command -v gsed 2>/dev/null) || SED=$(command -v sed 2>/dev/null)
[[ -n $SED ]] || fatal "can't find sed in path"

# Read config file
if [[ ! -f ${CONFIG} ]]; then
  fatal "config file '$CONFIG' doesn't exist or isn't a plaintext file"
fi

wantfields=4
# -r keeps backslashes in the regexps intact; IFS is left alone so leading and
# trailing whitespace is still trimmed. The '|| [[ -n ]]' part handles a last
# line without a trailing newline.
while read -r line || [[ -n $line ]]; do
  [[ -z $line ]] && continue
  [[ $line == \#* ]] && continue

  # Count fields with the same splitter that extracts them, honouring -F.
  nfields=$(awk -F "$SEP" '{ print NF }' <<<"$line")
  if [[ $nfields -ne $wantfields ]]; then
    fatal "wrong number of fields in config line (want $wantfields, got $nfields):\n$line\n"
  fi

  sitename=$(awk -F "$SEP" '{ print $1 }' <<<"$line")
  url=$(awk -F "$SEP" '{ print $2 }' <<<"$line")
  re_start=$(awk -F "$SEP" '{ print $3 }' <<<"$line")
  re_end=$(awk -F "$SEP" '{ print $4 }' <<<"$line")
  addsite "$sitename" "$url" "$re_start" "$re_end"
done <"${CONFIG}"

if [[ $MODE == "list" ]]; then
  # shellcheck disable=SC2059 -- FORMAT is a constant defined in this file.
  printf "$FORMAT" "Site Name" "URL"
  for x in "${!site_name[@]}"; do
    showsite_byidx "$x"
  done
  exit 0
elif [[ $MODE == "test" ]]; then
  if [[ -z $TESTSITE ]]; then
    fatal "Test mode in use, but \$TESTSITE is empty."
  fi
  idx=$(findsite "$TESTSITE") \
    || fatal "Requested site '$TESTSITE' not found in config file ($CONFIG)."

  url="${site_url[$idx]}"
  init_workdir
  temp=$(newtempfile) || exit 1
  getcontent "$url" "$temp" || fatal "failed to fetch '$TESTSITE' ($url)"
  if [[ $TEST_USERES -eq 1 ]]; then
    "$SED" -n "/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p" "$temp"
  else
    cat "$temp"
  fi
  exit 0
fi

# confirm output dir
if [[ -e ${OUTDIR} ]]; then
  if [[ ! -d ${OUTDIR} ]]; then
    fatal "output directory '$OUTDIR' already exists but is not a directory"
  fi
else
  mkdir -p "${OUTDIR}" || fatal "cannot create output directory '$OUTDIR'"
fi

if [[ $DOGITPUSH -eq 1 && $DOGITCOMMIT -ne 1 ]]; then
  fatal "git push option cannot be used without git commit option"
fi

if [[ $DOGITCOMMIT -eq 1 && ! -d ${OUTDIR}/.git ]]; then
  fatal "error - git commit option used, but output directory '$OUTDIR' doesn't seem to be a git repo"
fi

if [[ $DOGITPUSH -eq 1 ]]; then
  if ! grep -q "\[remote" "${OUTDIR}/.git/config" 2>/dev/null; then
    fatal "git push option used, but output git repo '$OUTDIR' doesn't have any remotes"
  fi
fi

init_workdir
filesdone=()
summary=""
for x in "${!site_name[@]}"; do
  sitename="${site_name[$x]}"
  url="${site_url[$x]}"
  re_start="${site_re_start[$x]}"
  re_end="${site_re_end[$x]}"

  if [[ -z $re_start || -z $re_end ]]; then
    # skip processing
    continue
  fi

  thisfile="${sitename}.txt"
  outfile="${OUTDIR}/${thisfile}"

  temp=$(newtempfile) || exit 1
  temp2=$(newtempfile) || exit 1

  # A failed download must not overwrite the stored copy or be reported as an
  # update, so the site is skipped with a warning.
  if ! getcontent "$url" "$temp"; then
    echo "WARNING: failed to fetch '${sitename}' (${url}), skipping" >&2
    rm -f -- "$temp" "$temp2"
    continue
  fi

  # Header layout (two blank lines after the URL) matches files written by
  # earlier runs so they do not all show up as changed.
  printf 'SITE: %s\nURL: %s\n\n\n' "$sitename" "$url" >"$temp2"
  "$SED" -n "/${re_start}/,/${re_end}/p" "$temp" >>"$temp2"

  # has it changed?
  changed=1
  if [[ -e ${outfile} ]] && cmp -s "$temp2" "$outfile"; then
    changed=0
  fi

  if [[ $changed -eq 1 ]]; then
    text="UPDATES FOUND"
  else
    text="no change"
  fi

  cp -a "${temp2}" "${outfile}"
  echo "Scraped '${sitename}' to '${outfile}' ($text)"
  rm -f -- "${temp}" "${temp2}"

  if [[ $changed -eq 1 ]]; then
    filesdone+=("$thisfile")
    if [[ -z $summary ]]; then
      summary=$(showsite "$sitename")
    else
      summary+=$'\n'"$(showsite "$sitename")"
    fi
  fi
done

if [[ $DOGITCOMMIT -eq 1 && ${#filesdone[@]} -gt 0 ]]; then
  printf '%s' "Doing git add for [${filesdone[*]}]..." >&2
  msg="Policies have been updated for the following sites:"$'\n'"${summary}"

  pushd "${OUTDIR}" &>/dev/null \
    || fatal "cannot change to output directory '$OUTDIR'"

  if res=$(git add -- "${filesdone[@]}" 2>&1); then
    echo "ok" >&2
  else
    echo "failed" >&2
    fatal "git add failed:\n$res\n"
  fi

  printf '%s' "Doing git commit..." >&2
  if res=$(git commit -m "$msg" 2>&1); then
    echo "ok" >&2
  elif [[ $res == *"othing to commit"* ]]; then
    echo "ok (no changes)" >&2
  else
    echo "failed" >&2
    fatal "git commit failed:\n$res\n"
  fi

  if [[ $DOGITPUSH -eq 1 ]]; then
    printf '%s' "Doing git push..." >&2
    if res=$(git push 2>&1); then
      echo "ok" >&2
    else
      echo "failed" >&2
      fatal "git push failed:\n$res\n"
    fi
  fi

  popd &>/dev/null || true
fi

exit 0