tostracker/tostracker.sh

345 lines
8.9 KiB
Bash
Raw Normal View History

2023-12-01 14:53:12 +11:00
#!/usr/bin/env bash
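# Config file format: one site per line, four fields separated by "@"
# (or by the character given with -F). Blank lines and lines starting
# with "#" are ignored. Sites with an empty start or end regexp are skipped.
#
# sitename@url@start_regexp@end_regexp
#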
# Temp-file registry used by newtempfile() / cleanup().
ntempfiles=0
declare -a tempfile=()

# Site table filled by addsite(). nsites is the next free index; it must be
# initialised here because an unset value makes "site_name[$nsites]=..."
# fail with "bad array subscript" for the first configured site.
nsites=0
declare -a site_name=() site_url=() site_re_start=() site_re_end=()

# Built-in defaults; the command-line options override the working copies.
readonly DEFAULT_SEP="@"
readonly DEFAULT_OUTDIR="${HOME}/.tostracker/output"
readonly DEFAULT_CONFIG="${HOME}/.tostracker/config"

# printf format for one "site name / URL" row in listings and summaries.
readonly FORMAT="%-32s %s\n"
# Print the command-line help text to stdout.
# Globals read: DEFAULT_CONFIG, DEFAULT_SEP, DEFAULT_OUTDIR
# The defaults shown are the built-in ones, not values already overridden
# by earlier options on the same command line.
function usage() {
  cat <<EOF
usage: $0 OPTIONS

OPTIONS:
 -c filename Use given config file instead of default (${DEFAULT_CONFIG})
 -F char Use given character as a field separator in config file instead of default (${DEFAULT_SEP})
 -gc After site scrapes, run 'git add' on all files, then 'git commit'
 -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'
 -h Show this help text then exit
 -l List configured sites then exit
 -L Use lynx to render html
 -o dirname Use given output dir instead of default (${DEFAULT_OUTDIR})
 -t sitename Just output raw content of given site, useful for finding start/end regexps.
 -T sitename Just output content of given site between re_start and re_end regexps.


EOF
}
# Create a new temp file /tmp/tostracker.XXXXXX, record its path in the global
# tempfile[] list (bumping ntempfiles) and print the path on stdout.
# Exits with status 1 if mktemp fails.
# NOTE: when called as $(newtempfile) the bookkeeping and the exit happen in
# the command-substitution subshell, so the caller's tempfile[] is not updated.
function newtempfile() {
  local path
  path=$(mktemp /tmp/tostracker.XXXXXX) || {
    echo "failed to create temp file" >&2
    exit 1
  }
  tempfile[$ntempfiles]="$path"
  ntempfiles=$((ntempfiles + 1))
  printf '%s\n' "$path"
}
# Delete every file recorded in the global tempfile[] list that still exists.
function cleanup() {
  local path
  for path in "${tempfile[@]}"; do
    [[ -e $path ]] && rm -f "$path"
  done
}
# Append one site to the parallel site arrays.
# Arguments: $1 - site name, $2 - URL, $3 - start regexp, $4 - end regexp
# Globals written: site_name, site_url, site_re_start, site_re_end, nsites
function addsite() {
  # nsites may still be unset on the first call; an empty subscript would
  # make the assignments fail ("bad array subscript") and drop the site.
  local idx=${nsites:-0}
  site_name[$idx]="$1"
  site_url[$idx]="$2"
  site_re_start[$idx]="$3"
  site_re_end[$idx]="$4"
  nsites=$((idx + 1))
}
# Print "FATAL: <args>" to stderr (backslash escapes such as \n in the
# message are expanded by echo -e) and terminate the script with status 1.
function fatal() {
  local msg="FATAL: $*"
  echo -e "$msg" >&2
  exit 1
}
# Print the listing row for the site whose name is exactly $1.
# Returns 1 when $1 is empty or no configured site has that name; otherwise
# returns whatever showsite_byidx returns.
function showsite() { #1=sitename
  local wanted="$1" i
  if [[ -z $wanted ]]; then
    return 1
  fi
  for i in "${!site_name[@]}"; do
    if [[ ${site_name[$i]} == "$wanted" ]]; then
      # First match wins.
      showsite_byidx "$i"
      return
    fi
  done
  return 1
}
# Print one listing row (site name and URL, laid out with $FORMAT) for the
# site stored at array index $1.
function showsite_byidx() { #1=idx
  local name="${site_name[$1]}" link="${site_url[$1]}"
  printf "$FORMAT" "$name" "$link"
}
# Download a page and write a plain-text rendering of it to a file.
# Arguments: $1 - URL to fetch, $2 - output file
# Globals read: LYNX (path to lynx; empty = built-in tag stripper),
#               SED (path to sed; falls back to "sed" if unset)
function getcontent() { # 1=url 2=outputfile
  local url="$1"
  local outfile="$2"
  if [[ -n $LYNX ]]; then
    # Let lynx render the HTML to text.
    curl -sL "$url" | "${LYNX}" -stdin -dump -nolist > "${outfile}"
  else
    # Poor man's renderer: put every tag on its own line, drop empty lines,
    # then drop the lines that consist solely of a tag.
    # grep -E replaces the obsolescent egrep, which warns on current GNU grep.
    curl -sL "$url" \
      | "${SED:-sed}" 's/>/>\n/g;s/</\n</g;' \
      | awk NF \
      | grep -Ev '^<.*>$' > "${outfile}"
  fi
}
# Remove temp files on every exit path. A TERM handler that only ran cleanup
# would return and let the script keep going after SIGTERM, so TERM exits
# explicitly (128 + 15); the EXIT trap then performs the cleanup.
trap cleanup EXIT
trap 'exit 143' TERM

# Defaults (overridden by command-line options)
OUTDIR="${DEFAULT_OUTDIR}"
CONFIG="${DEFAULT_CONFIG}"
SEP="${DEFAULT_SEP}"
DOGITCOMMIT=0   # 1 = git add + commit after scraping (-gc / -gp)
DOGITPUSH=0     # 1 = additionally git push (-gp)
MODE="normal"   # normal | list (-l) | test (-t / -T)
TESTSITE=""     # site name given to -t / -T
TEST_USERES=0   # 1 = apply start/end regexps in test mode (-T)
LYNX=""         # path to lynx when -L is given, otherwise empty
# Option string for getopts ("-gc" / "-gp" are parsed as -g with argument c/p).
ARGS="hc:F:g:lLo:t:T:"

# Parse the command-line options into the global settings.
# Arguments: the script's command line ("$@")
# Globals written: CONFIG, SEP, OUTDIR, DOGITCOMMIT, DOGITPUSH, MODE,
#                  TESTSITE, TEST_USERES, LYNX
# Exits with status 1 (after printing usage) on -h, on an unknown option, on a
# missing option argument, or when non-option arguments are left over.
function parse_args() {
  local opt OPTARG OPTIND=1
  # The leading ':' selects getopts' silent mode so the offending option
  # letter is available in OPTARG and can be named in the error message.
  while getopts ":${ARGS}" opt; do
    case "$opt" in
      h)
        usage
        exit 1
        ;;
      g)
        if [[ $OPTARG == "c" ]]; then
          DOGITCOMMIT=1
          DOGITPUSH=0
        elif [[ $OPTARG == "p" ]]; then
          DOGITCOMMIT=1
          DOGITPUSH=1
        else
          fatal "invalid git subargument - must use -gc or -gp"
        fi
        ;;
      l)
        MODE="list"
        ;;
      L)
        LYNX=$(type -p lynx 2>/dev/null)
        [[ -n $LYNX ]] || fatal "lynx not found in path"
        ;;
      o)
        OUTDIR="$OPTARG"
        ;;
      c)
        CONFIG="$OPTARG"
        ;;
      F)
        SEP="$OPTARG"
        ;;
      t)
        MODE="test"
        TESTSITE="$OPTARG"
        TEST_USERES=0
        ;;
      T)
        MODE="test"
        TESTSITE="$OPTARG"
        TEST_USERES=1
        ;;
      :)
        echo "option '-$OPTARG' requires an argument" >&2
        usage
        exit 1
        ;;
      *)
        echo "invalid option '-$OPTARG'" >&2
        usage
        exit 1
        ;;
    esac
  done
  shift $((OPTIND - 1))
  # No positional arguments are accepted.
  if [[ $# -ne 0 ]]; then
    usage
    exit 1
  fi
}

parse_args "$@"
# Locate sed, preferring "gsed" when it is installed and falling back to
# plain "sed". command -v is a shell builtin, so the lookup does not depend
# on the external which(1) being present.
SED=$(command -v gsed 2>/dev/null) || SED=$(command -v sed 2>/dev/null)
if [[ -z $SED ]]; then
  fatal "can't find sed in path"
fi
# Read the site list from a config file and register every entry via addsite().
# Arguments: $1 - path of the config file
# Globals read: SEP (field separator, taken literally; "@" if unset)
# Each non-blank, non-"#" line must hold exactly four fields:
#   sitename SEP url SEP start_regexp SEP end_regexp
# Calls fatal on a missing file or a line with the wrong number of fields.
function load_config() {
  local cfg="$1" sep="${SEP:-@}"
  local -r wantfields=4
  local line stripped nfields rest sitename url re_start re_end

  if [[ ! -f $cfg ]]; then
    fatal "config file '$cfg' doesn't exist or isn't a plaintext file"
  fi

  # -r keeps the backslashes that regexps rely on; the "|| [[ -n ... ]]"
  # part keeps a final line that lacks a trailing newline. Leading and
  # trailing whitespace is still trimmed by read (default IFS).
  while read -r line || [[ -n $line ]]; do
    [[ -z $line ]] && continue
    [[ $line =~ ^# ]] && continue

    # Count fields using the configured separator (not a hard-coded "@"):
    # the number of separators is the length lost when they are removed.
    stripped=${line//"$sep"/}
    nfields=$(((${#line} - ${#stripped}) / ${#sep} + 1))
    if [[ $nfields -ne $wantfields ]]; then
      fatal "wrong number of fields in config line (want $wantfields, got $nfields):\n$line\n"
    fi

    # Peel the fields off left to right; empty fields are preserved.
    rest=$line
    sitename=${rest%%"$sep"*}
    rest=${rest#*"$sep"}
    url=${rest%%"$sep"*}
    rest=${rest#*"$sep"}
    re_start=${rest%%"$sep"*}
    re_end=${rest#*"$sep"}

    addsite "$sitename" "$url" "$re_start" "$re_end"
  done < "$cfg"
}

# Read config file
load_config "${CONFIG}"
# Print the array index of the site whose name is exactly $1.
# Returns 1 (printing nothing) when no configured site has that name.
function find_site_index() {
  local wanted="$1" i
  for i in "${!site_name[@]}"; do
    # Quoted right-hand side: compare literally, not as a glob pattern.
    if [[ ${site_name[$i]} == "$wanted" ]]; then
      echo "$i"
      return 0
    fi
  done
  return 1
}

if [[ $MODE == "list" ]]; then
  # -l: print the configured sites and stop.
  printf "$FORMAT" "Site Name" "URL"
  for x in "${!site_name[@]}"; do
    showsite_byidx "$x"
  done
  exit 0
elif [[ $MODE == "test" ]]; then
  # -t / -T: dump one site's content (raw, or cut down to the section
  # between its start and end regexps) and stop.
  if [[ -z $TESTSITE ]]; then
    fatal "Test mode in use, but \$TESTSITE is empty."
  fi
  if ! idx=$(find_site_index "$TESTSITE"); then
    fatal "Requested site '$TESTSITE' not found in config file ($CONFIG)."
  fi
  url="${site_url[$idx]}"
  temp=$(newtempfile) || exit 1
  getcontent "$url" "$temp"
  if [[ $TEST_USERES -eq 1 ]]; then
    sedcmd="/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p"
    "${SED}" -n "$sedcmd" "$temp"
  else
    cat "$temp"
  fi
  # Report the status of the output command, not of the rm below.
  rc=$?
  rm -f "$temp"
  exit "$rc"
fi
# Make sure $OUTDIR exists and is a directory, creating it if necessary.
# Calls fatal when the path exists but is not a directory, or when it
# cannot be created.
function prepare_outdir() {
  if [[ -e ${OUTDIR} ]]; then
    if [[ ! -d ${OUTDIR} ]]; then
      fatal "output directory '$OUTDIR' already exists but is not a directory"
    fi
  else
    mkdir -p "${OUTDIR}" || fatal "cannot create output directory '$OUTDIR'"
  fi
}

# confirm output dir
prepare_outdir

# Sanity-check the git options before any scraping is done.
if [[ $DOGITPUSH -eq 1 && $DOGITCOMMIT -ne 1 ]]; then
  fatal "git push option cannot be used without git commit option"
fi
if [[ $DOGITCOMMIT -eq 1 ]]; then
  if [[ ! -d ${OUTDIR}/.git ]]; then
    fatal "error - git commit option used, but output directory '$OUTDIR' doesn't seem to be a git repo"
  fi
fi
if [[ $DOGITPUSH -eq 1 ]]; then
  # A push needs at least one [remote ...] section in the repo config.
  if ! grep -q "\[remote" "${OUTDIR}/.git/config" 2>/dev/null; then
    fatal "git push option used, but output git repo '$OUTDIR' doesn't have any remotes"
  fi
fi
# Fetch one configured site, extract the section between its start and end
# regexps and store it (with a SITE/URL header) as $OUTDIR/<sitename>.txt.
# Arguments: $1 - index into the site arrays
# Globals read: site_name, site_url, site_re_start, site_re_end, OUTDIR, SED
# Returns: 0 - stored copy was created or updated
#          1 - extracted content is identical to the stored copy
#          2 - site skipped (temp file could not be created, or the download
#              produced no content); the stored copy is left untouched
function scrape_site() {
  local idx="$1"
  local sitename="${site_name[$idx]}" url="${site_url[$idx]}"
  local outfile="${OUTDIR}/${sitename}.txt"
  local raw extracted text status=0

  raw=$(newtempfile) || return 2
  extracted=$(newtempfile) || {
    rm -f "$raw"
    return 2
  }

  getcontent "$url" "$raw"
  if [[ ! -s $raw ]]; then
    # A failed download must not wipe the stored policy text or be
    # reported (and committed) as an update.
    echo "WARNING: no content retrieved for '${sitename}' (${url}), leaving '${outfile}' untouched" >&2
    rm -f "$raw" "$extracted"
    return 2
  fi

  # Header (two blank lines after the URL line), then the wanted section.
  printf 'SITE: %s\nURL: %s\n\n\n' "$sitename" "$url" > "$extracted"
  "${SED}" -n "/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p" "$raw" >> "$extracted"

  if [[ -e $outfile ]] && diff -q "$extracted" "$outfile" &>/dev/null; then
    status=1
    text="no change"
  else
    text="UPDATES FOUND"
    cp -a "$extracted" "$outfile" || fatal "cannot write '$outfile'"
  fi
  echo "Scraped '${sitename}' to '${outfile}' ($text)"
  rm -f "$raw" "$extracted"
  return "$status"
}

# Scrape every site that has both regexps configured and collect what changed.
# filesdone / sitesdone are space-separated lists, summary holds one
# showsite row per changed site.
filesdone=""
sitesdone=""
summary=""
for x in "${!site_name[@]}"; do
  if [[ -z ${site_re_start[$x]} || -z ${site_re_end[$x]} ]]; then
    # skip processing
    continue
  fi
  if scrape_site "$x"; then
    sitename="${site_name[$x]}"
    sitesdone="$sitesdone $sitename"
    filesdone="$filesdone ${sitename}.txt"
    if [[ -z $summary ]]; then
      summary="$(showsite "$sitename")"
    else
      summary="${summary}"$'\n'"$(showsite "$sitename")"
    fi
  fi
done
# Optional git step: add the changed files, commit them with a summary of the
# affected sites and, with -gp, push. Nothing is done when no site changed.
if [[ $DOGITCOMMIT -eq 1 && -n $filesdone ]]; then
  msg=$(printf 'Policies have been updated for the following sites:\n%s' "$summary")
  # filesdone is a space-separated list; split it without glob expansion.
  read -r -a fd <<<"$filesdone"

  echo -n "Doing git add for [${filesdone}]..." >&2
  # The git commands below must run inside the output repo, so a failed
  # directory change is fatal rather than silently ignored.
  pushd "${OUTDIR}" &>/dev/null || fatal "cannot change to output directory '$OUTDIR'"
  if res=$(git add -- "${fd[@]}" 2>&1); then
    echo "ok" >&2
  else
    echo "failed" >&2
    fatal "git add failed:\n$res\n"
  fi

  echo -n "Doing git commit..." >&2
  if res=$(git commit -m "$msg" 2>&1); then
    echo "ok" >&2
  elif [[ $res == *"othing to commit"* ]]; then
    # Matches both "nothing to commit" and "Nothing to commit".
    echo "ok (no changes)" >&2
  else
    echo "failed" >&2
    fatal "git commit failed:\n$res\n"
  fi

  if [[ $DOGITPUSH -eq 1 ]]; then
    echo -n "Doing git push..." >&2
    if res=$(git push 2>&1); then
      echo "ok" >&2
    else
      echo "failed" >&2
      fatal "git push failed:\n$res\n"
    fi
  fi
  popd &>/dev/null
fi
exit 0