2023-12-01 14:53:12 +11:00
|
|
|
#!/usr/bin/env bash
|
|
|
|
# format:
|
|
|
|
#
|
|
|
|
# sitename@url@start_regexp@end_regexp   (four fields per line; '@' is the default separator)
|
|
|
|
#
|
|
|
|
# Temp-file registry: newtempfile() appends to tempfile[] and bumps
# ntempfiles; cleanup() removes every path recorded here.
ntempfiles=0
tempfile=()

# Site table: addsite() stores one entry per site at index nsites in each of
# the parallel arrays below. nsites must start at 0 so the first site lands
# in slot 0.
nsites=0
site_name=()
site_url=()
site_re_start=()
site_re_end=()

# Built-in defaults; the option parser may override the derived settings.
DEFAULT_SEP="@"
DEFAULT_OUTDIR="${HOME}/.tostracker/output"
DEFAULT_CONFIG="${HOME}/.tostracker/config"

# printf format used for every "site name / URL" row.
FORMAT="%-32s %s\n"
|
|
|
|
|
|
|
|
|
|
|
|
# Print the command-line help to stdout.
#
# Globals read: DEFAULT_CONFIG, DEFAULT_SEP, DEFAULT_OUTDIR (shown as the
# defaults so the text always matches what the script really uses).
# Does not exit; callers decide the exit status.
function usage() {
    # Unquoted here-doc delimiter on purpose: $0 and the DEFAULT_* values
    # are expanded. The two empty lines before EOF are part of the output.
    cat <<EOF
usage: $0 OPTIONS

OPTIONS:
 -c filename    Use given config file instead of default (${DEFAULT_CONFIG})
 -F char        Use given character as a field separator in config file instead of default (${DEFAULT_SEP})
 -gc            After site scrapes, run 'git add' on all files, then 'git commit'
 -gp            After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'
 -h             Show this usage text and exit
 -l             List the sites defined in the config file and exit
 -o dirname     Use given output dir instead of default (${DEFAULT_OUTDIR})
 -t sitename    Just output raw content of given site, useful for finding start/end regexps.
 -T sitename    Just output content of given site between re_start and re_end regexps.


EOF
}
|
|
|
|
|
|
|
|
# Create a temp file under /tmp and record it in tempfile[] for cleanup().
#
# Arguments:
#   $1 (optional) - name of a variable that receives the new path.
# Outputs:
#   Without $1 the path is written to stdout (original behaviour).
# Exits with status 1 if mktemp fails.
#
# Why the optional argument exists: when this function is used as
# `var=$(newtempfile)` it runs in a command-substitution subshell, so the
# tempfile[]/ntempfiles updates never reach the calling shell and cleanup()
# cannot see the file. `newtempfile var` runs in the calling shell, so the
# registration sticks.
function newtempfile() { #1=optional name of output variable
    # Prefixed local name so it cannot shadow the caller's output variable.
    local _ntf_path
    if ! _ntf_path=$(mktemp /tmp/tostracker.XXXXXX); then
        echo "failed to create temp file" >&2
        exit 1
    fi
    # Arithmetic subscript: an unset counter is treated as 0.
    tempfile[ntempfiles]="$_ntf_path"
    ntempfiles=$((ntempfiles + 1))
    if [[ -n ${1:-} ]]; then
        printf -v "$1" '%s' "$_ntf_path"
    else
        echo "$_ntf_path"
    fi
}
|
|
|
|
|
|
|
|
# Delete every temp file recorded in tempfile[] that still exists.
# Installed as a trap handler elsewhere in the script.
function cleanup() {
    local path
    for path in "${tempfile[@]}"; do
        [[ -e $path ]] && rm -f "$path"
    done
}
|
|
|
|
|
|
|
|
# Append one site to the parallel site_* arrays and advance nsites.
#
# Arguments: $1 site name, $2 URL, $3 start regexp, $4 end regexp.
# Globals written: site_name, site_url, site_re_start, site_re_end, nsites.
function addsite() { #1=sitename 2=url 3=re_start 4=re_end
    # Evaluate the counter arithmetically: an unset or empty nsites becomes 0.
    # A textual "$nsites" subscript would expand to an empty subscript on the
    # first call, which bash rejects ("bad array subscript"), silently
    # dropping the first site.
    local slot=$((nsites + 0))
    site_name[slot]="$1"
    site_url[slot]="$2"
    site_re_start[slot]="$3"
    site_re_end[slot]="$4"
    nsites=$((slot + 1))
}
|
|
|
|
|
|
|
|
# Report an unrecoverable error on stderr and terminate with status 1.
# All arguments are joined into one message; backslash escapes in it
# (callers embed "\n") are expanded.
function fatal() {
    local report="FATAL: $*"
    printf '%b\n' "$report" 1>&2
    exit 1
}
|
|
|
|
|
|
|
|
# Print the table row for the site whose name equals $1 exactly.
# Returns 1 when $1 is empty or no site has that name; otherwise the
# status of showsite_byidx.
function showsite() { #1=sitename
    local wanted="$1" pos

    if [[ -z $wanted ]]; then
        return 1
    fi

    for pos in "${!site_name[@]}"; do
        if [[ ${site_name[$pos]} == "$wanted" ]]; then
            # First match wins.
            showsite_byidx "$pos"
            return
        fi
    done

    return 1
}
|
|
|
|
|
|
|
|
# Print one "name / URL" row for the site stored at index $1,
# laid out with the global FORMAT string.
function showsite_byidx() { #1=idx
    local name="${site_name[$1]}" link="${site_url[$1]}"
    printf "$FORMAT" "$name" "$link"
}
|
|
|
|
|
|
|
|
# Remove registered temp files whenever the script exits.
trap cleanup EXIT
# Listing TERM next to EXIT would make SIGTERM run cleanup and then resume
# the script. Exit instead (143 = 128 + SIGTERM) so the request to terminate
# is honoured; the EXIT trap above then performs the cleanup.
trap 'exit 143' TERM

# Defaults
OUTDIR="${DEFAULT_OUTDIR}"
CONFIG="${DEFAULT_CONFIG}"
SEP="${DEFAULT_SEP}"
DOGITCOMMIT=0   # 1 = git add + commit after scraping (-gc / -gp)
DOGITPUSH=0     # 1 = also git push (-gp)
MODE="normal"   # "normal", "list" (-l) or "test" (-t / -T)
TESTSITE=""     # site name given to -t / -T
TEST_USERES=0   # 1 = apply the start/end regexps in test mode (-T)
|
|
|
|
|
|
|
|
# getopts option string. The leading ':' selects silent error reporting so
# the parser below can name the offending option itself.
ARGS=":hc:F:g:lo:t:T:"

# Parse the command line into the option globals.
#
# Arguments: the script's positional parameters ("$@").
# Globals written: DOGITCOMMIT, DOGITPUSH, MODE, OUTDIR, CONFIG, SEP,
#                  TESTSITE, TEST_USERES.
# Exits with status 1 (after printing usage) on -h, on an unknown option, on
# a missing option argument, or when non-option arguments remain. An invalid
# -g sub-argument goes through fatal.
function parse_args() {
    # Local OPTIND/OPTARG so repeated calls start from the first argument.
    local opt OPTIND=1 OPTARG

    while getopts "$ARGS" opt; do
        case "$opt" in
            h)
                usage
                exit 1
                ;;
            g)
                # "-gc" / "-gp" arrive as option g with argument c / p.
                case "$OPTARG" in
                    c)
                        DOGITCOMMIT=1
                        DOGITPUSH=0
                        ;;
                    p)
                        DOGITCOMMIT=1
                        DOGITPUSH=1
                        ;;
                    *)
                        fatal "invalid git subargument - must use -gc or -gp"
                        ;;
                esac
                ;;
            l)
                MODE="list"
                ;;
            o)
                OUTDIR="$OPTARG"
                ;;
            c)
                CONFIG="$OPTARG"
                ;;
            F)
                SEP="$OPTARG"
                ;;
            t)
                MODE="test"
                TESTSITE="$OPTARG"
                TEST_USERES=0
                ;;
            T)
                MODE="test"
                TESTSITE="$OPTARG"
                TEST_USERES=1
                ;;
            :)
                # Silent mode: OPTARG holds the option that lacks its argument.
                echo "option '-$OPTARG' requires an argument" >&2
                usage
                exit 1
                ;;
            *)
                # Silent mode: opt is '?' and OPTARG holds the unknown option.
                echo "invalid option '-$OPTARG'" >&2
                usage
                exit 1
                ;;
        esac
    done
    shift $((OPTIND - 1))

    # No positional arguments are accepted.
    if [[ $# -ne 0 ]]; then
        usage
        exit 1
    fi
}

parse_args "$@"
|
|
|
|
|
|
|
|
|
2023-12-01 15:08:32 +11:00
|
|
|
# Locate the sed binary used by the scraping pipelines: 'gsed' is tried
# first, then plain 'sed'.
# NOTE(review): presumably gsed is preferred so that GNU sed is used on
# systems where 'sed' is another implementation - confirm.
# 'command -v' is the shell builtin lookup; unlike 'which' its exit status
# and output format do not vary between systems.
SED=$(command -v gsed 2>/dev/null) || SED=$(command -v sed 2>/dev/null)
if [[ -z $SED ]]; then
    fatal "can't find sed in path"
fi
|
|
|
|
|
2023-12-01 14:53:12 +11:00
|
|
|
# Read config file

# Number of separator-delimited fields every config line must have:
# sitename, url, start regexp, end regexp.
wantfields=4

# Parse ${CONFIG} and register every site through addsite.
#
# Globals read: CONFIG, SEP, wantfields.
# Blank lines and lines starting with '#' are ignored. Any other line must
# contain exactly ${wantfields} fields separated by ${SEP}; otherwise fatal
# is called. A missing or non-regular config file is also fatal.
function load_config() {
    local line nfields sitename url re_start re_end

    if [[ ! -e ${CONFIG} || ! -f ${CONFIG} ]]; then
        fatal "config file '$CONFIG' doesn't exist or isn't a plaintext file"
    fi

    # NOTE(review): 'read' is deliberately left without -r / IFS= so that
    # existing config files keep being interpreted exactly as before
    # (surrounding whitespace trimmed, backslashes processed by read).
    # The '|| [[ -n $line ]]' part keeps a final line that has no trailing
    # newline from being dropped.
    while read line || [[ -n $line ]]; do
        [[ -z $line ]] && continue
        [[ $line =~ ^# ]] && continue

        # Count fields with the same awk/-F splitting that extracts them
        # below, so a separator chosen with -F is honoured (a hard-coded '@'
        # count rejects every line once the separator is changed).
        nfields=$(awk -F "$SEP" '{ print NF }' <<<"$line")
        if [[ $nfields -ne $wantfields ]]; then
            fatal "wrong number of fields in config line (want $wantfields, got $nfields):\n$line\n"
        fi

        sitename=$(awk -F "$SEP" '{ print $1 }' <<<"$line")
        url=$(awk -F "$SEP" '{ print $2 }' <<<"$line")
        re_start="$(awk -F "$SEP" '{ print $3 }' <<<"$line")"
        re_end="$(awk -F "$SEP" '{ print $4 }' <<<"$line")"

        addsite "$sitename" "$url" "$re_start" "$re_end"

        #echo "sitename: [$sitename]"
        #echo "url: [$url]"
        #echo "start: [$re_start]"
        #echo "end: [$re_end]"
    done < "${CONFIG}"
}

load_config
|
|
|
|
|
|
|
|
|
|
|
|
# Handle the two modes that replace the normal scrape run.
#
# Globals read: MODE, FORMAT, TESTSITE, TEST_USERES, SED, CONFIG and the
#               site_* arrays.
#   MODE=list - print a header row plus one row per configured site, exit 0.
#   MODE=test - fetch the page of site TESTSITE and print its text lines
#               (lines consisting only of a tag are dropped); with
#               TEST_USERES=1 only the range between the site's start and
#               end regexps is printed. Exits afterwards.
#   any other MODE - returns without doing anything.
# An empty or unknown TESTSITE is fatal.
function handle_special_modes() {
    local x idx url temp sedcmd

    case "$MODE" in
        list)
            printf "$FORMAT" "Site Name" "URL"
            for x in "${!site_name[@]}"; do
                showsite_byidx "$x"
            done
            exit 0
            ;;
        test)
            if [[ -z $TESTSITE ]]; then
                fatal "Test mode in use, but \$TESTSITE is empty."
            fi

            idx=-1
            for x in "${!site_name[@]}"; do
                # Quoted right-hand side: compare literally. Unquoted, the
                # requested name would be treated as a glob pattern.
                if [[ ${site_name[$x]} == "$TESTSITE" ]]; then
                    idx="$x"
                    break
                fi
            done
            if [[ $idx == -1 ]]; then
                fatal "Requested site '$TESTSITE' not found in config file ($CONFIG)."
            fi
            url="${site_url[$idx]}"

            temp=$(newtempfile)
            # Put every tag on its own line, drop empty lines, then drop the
            # lines that are nothing but a tag.
            curl -sL "$url" \
                | "${SED}" 's/>/>\n/g;s/</\n</g;' \
                | awk NF \
                | grep -E -v '^<.*>$' > "${temp}"

            if [[ $TEST_USERES -eq 1 ]]; then
                sedcmd="/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p"
                "${SED}" -n "$sedcmd" "$temp"
            else
                cat "$temp"
            fi
            rm -f "$temp"
            exit $?
            ;;
    esac
}

handle_special_modes
|
|
|
|
|
|
|
|
# confirm output dir

# Validate the output location and the git-related options before scraping.
#
# Globals read: OUTDIR, DOGITCOMMIT, DOGITPUSH.
# Creates OUTDIR (with parents) when it does not exist. Calls fatal when:
#   - OUTDIR exists but is not a directory, or cannot be created;
#   - push is requested without commit;
#   - commit is requested but OUTDIR has no .git directory;
#   - push is requested but OUTDIR/.git/config has no "[remote" section.
function prepare_outdir() {
    # The existence test must look at OUTDIR itself; testing an unset
    # variable makes the "not a directory" branch unreachable.
    if [[ -e ${OUTDIR} ]]; then
        if [[ ! -d ${OUTDIR} ]]; then
            fatal "output directory '$OUTDIR' already exists but is not a directory"
        fi
    else
        mkdir -p "${OUTDIR}" || fatal "unable to create output directory '$OUTDIR'"
    fi

    if [[ $DOGITPUSH -eq 1 && $DOGITCOMMIT -ne 1 ]]; then
        fatal "git push option cannot be used without git commit option"
    fi

    if [[ $DOGITCOMMIT -eq 1 ]]; then
        if [[ ! -d ${OUTDIR}/.git ]]; then
            fatal "error - git commit option used, but output directory '$OUTDIR' doesn't seem to be a git repo"
        fi
    fi

    if [[ $DOGITPUSH -eq 1 ]]; then
        # A "[remote" section header in the repo config is taken as proof
        # that at least one remote is configured.
        if ! grep -q "\[remote" "${OUTDIR}/.git/config" 2>/dev/null; then
            fatal "git push option used, but output git repo '$OUTDIR' doesn't have any remotes"
        fi
    fi
}

prepare_outdir
|
|
|
|
|
|
|
|
filesdone=""
|
|
|
|
sitesdone=""
|
|
|
|
summary=""
|
|
|
|
for x in ${!site_name[@]}; do
|
|
|
|
sitename="${site_name[$x]}"
|
|
|
|
url="${site_url[$x]}"
|
|
|
|
re_start="${site_re_start[$x]}"
|
|
|
|
re_end="${site_re_end[$x]}"
|
|
|
|
|
|
|
|
if [[ -z $re_start || -z $re_end ]]; then
|
|
|
|
# skip processing
|
|
|
|
continue
|
|
|
|
fi
|
|
|
|
|
|
|
|
thisfile="${sitename}.txt"
|
|
|
|
outfile="${OUTDIR}/${thisfile}"
|
|
|
|
temp=$(newtempfile)
|
|
|
|
temp2=$(newtempfile)
|
2023-12-01 15:08:32 +11:00
|
|
|
curl -sL "$url" | ${SED} 's/>/>\n/g;s/</\n</g;' | awk NF | egrep -v "^<.*>$" > "${temp}"
|
2023-12-01 14:53:12 +11:00
|
|
|
sedcmd="/${re_start}/,/${re_end}/p"
|
|
|
|
|
|
|
|
echo -e "SITE: ${sitename}\nURL: $url\n\n" >${temp2}
|
2023-12-01 15:08:32 +11:00
|
|
|
cat "$temp" | ${SED} -n "$sedcmd" >> ${temp2}
|
2023-12-01 14:53:12 +11:00
|
|
|
|
|
|
|
if [[ -e ${outfile} ]]; then
|
|
|
|
# has it changed?
|
|
|
|
diff "$temp2" "$outfile" &>/dev/null
|
|
|
|
[[ $? -eq 0 ]] && changed=0 || changed=1
|
|
|
|
else
|
|
|
|
changed=1
|
|
|
|
fi
|
|
|
|
|
|
|
|
if [[ $changed -eq 1 ]]; then
|
|
|
|
text="UPDATES FOUND"
|
|
|
|
else
|
|
|
|
text="no change"
|
|
|
|
|
|
|
|
fi
|
|
|
|
cp -a "${temp2}" "${outfile}"
|
|
|
|
echo "Scraped '${sitename}' to '${outfile}' ($text)"
|
|
|
|
rm -f "${temp}" "${temp2}"
|
|
|
|
|
|
|
|
if [[ $changed -eq 1 ]]; then
|
|
|
|
sitesdone="$sitesdone $sitename"
|
|
|
|
filesdone="$filesdone $thisfile"
|
|
|
|
if [[ -z $summary ]]; then
|
|
|
|
summary="$(showsite $sitename)"
|
|
|
|
else
|
|
|
|
summary=$(echo -e "$summary\n$(showsite $sitename)")
|
|
|
|
fi
|
|
|
|
fi
|
|
|
|
done
|
|
|
|
|
|
|
|
# Record the changed files in the output repository.
#
# Globals read: DOGITCOMMIT, DOGITPUSH, OUTDIR, filesdone, summary.
# Does nothing unless DOGITCOMMIT is 1 and filesdone is non-empty. Otherwise
# runs, inside OUTDIR: git add <changed files>, git commit (message lists
# the changed sites), and git push when DOGITPUSH is 1. Progress goes to
# stderr. Any failing step is fatal, except a commit that reports there is
# nothing to commit.
function git_publish() {
    local msg res
    local -a fd

    if [[ $DOGITCOMMIT -ne 1 || -z $filesdone ]]; then
        return 0
    fi

    # Enter the repository first and stop if that fails; otherwise the git
    # commands below would act on whatever directory the script was
    # started from.
    if ! pushd "${OUTDIR}" &>/dev/null; then
        fatal "unable to enter output directory '$OUTDIR'"
    fi

    printf -v msg '%s\n%s' "Policies have been updated for the following sites:" "$summary"

    echo -n "Doing git add..." >&2
    # filesdone is a space-separated list; word splitting is intentional.
    fd=( $filesdone )
    if res=$(git add -- "${fd[@]}" 2>&1); then
        echo "ok" >&2
    else
        echo "failed" >&2
        fatal "git add failed:\n$res\n"
    fi

    echo -n "Doing git commit..." >&2
    if res=$(git commit -m "$msg" 2>&1); then
        echo "ok" >&2
    elif [[ $res == *"othing to commit"* ]]; then
        # Matches both "nothing to commit" and "Nothing to commit".
        echo "ok (no changes)" >&2
    else
        echo "failed" >&2
        fatal "git commit failed:\n$res\n"
    fi

    if [[ $DOGITPUSH -eq 1 ]]; then
        echo -n "Doing git push..." >&2
        if res=$(git push 2>&1); then
            echo "ok" >&2
        else
            echo "failed" >&2
            fatal "git push failed:\n$res\n"
        fi
    fi

    popd &>/dev/null
}

git_publish
|
|
|
|
|
|
|
|
# Normal completion: report success to the caller.
exit 0
|