Add -L option to use lynx to render html

This commit is contained in:
Rob Pearce 2024-06-30 09:40:51 +10:00
parent 391d137a21
commit 2825e35152
2 changed files with 25 additions and 6 deletions

View File

@ -15,6 +15,8 @@ OPTIONS:
-F char Use given character as a field separator in config file instead of default (@)
-gc After site scrapes, run 'git add' on all files, then 'git commit'
-gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'
-l List configured sites then exit
-L Use lynx to render html
-o dirname Use given output dir instead of default (.)
-t sitename Just output raw content of given site, useful for finding start/end regexps.
-T sitename Just output content of given site between re_start and re_end regexps.

View File

@ -18,6 +18,8 @@ function usage() {
echo " -F char Use given character as a field separator in config file instead of default (${SEP})"
echo " -gc After site scrapes, run 'git add' on all files, then 'git commit'"
echo " -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'"
echo " -l List configured sites then exit"
echo " -L Use lynx to render html"
echo " -o dirname Use given output dir instead of default (.)"
echo " -t sitename Just output raw content of given site, useful for finding start/end regexps."
echo " -T sitename Just output content of given site between re_start and re_end regexps."
@ -77,6 +79,17 @@ function showsite_byidx() { #1=idx
printf "$FORMAT" "${site_name[$idx]}" "${site_url[$idx]}"
}
# Fetch a URL with curl and write a text rendering of the page to a file.
#   $1 = url to fetch (curl runs silently and follows redirects)
#   $2 = output file, overwritten with the rendered content
# Globals read:
#   LYNX - when non-empty, the command used to render the page
#          (invoked with '-stdin -dump')
#   SED  - sed command used by the fallback (non-lynx) path
function getcontent() { # 1=url 2=outputfile
  local url="$1"
  local outfile="$2"
  if [[ -n $LYNX ]]; then
    curl -sL "$url" | "${LYNX}" -stdin -dump > "${outfile}"
  else
    # Fallback: break the markup so every tag sits on its own line, drop
    # empty lines, then discard lines that start with '<' and end with '>'.
    curl -sL "$url" \
      | "${SED}" 's/>/>\n/g;s/</\n</g;' \
      | awk NF \
      | grep -Ev "^<.*>$" > "${outfile}"
  fi
}
trap cleanup EXIT TERM
@ -89,8 +102,9 @@ DOGITPUSH=0
MODE="normal"
TESTSITE=""
TEST_USERES=0
LYNX=""
ARGS="hc:F:g:lo:t:T:"
ARGS="hc:F:g:lLo:t:T:"
while getopts "$ARGS" i; do
case "$i" in
h)
@ -111,6 +125,10 @@ while getopts "$ARGS" i; do
l)
MODE="list"
;;
L)
LYNX=$(type -p lynx 2>/dev/null)
[[ $? -ne 0 ]] && fatal "lynx not found in path"
;;
o)
OUTDIR="$OPTARG"
;;
@ -148,7 +166,6 @@ fi
SED=$(which gsed 2>/dev/null)
[[ $? -ne 0 ]] && SED=$(which sed 2>/dev/null)
[[ -z $SED ]] && fatal "can't find sed in path"
# Read config file
if [[ ! -e ${CONFIG} || ! -f ${CONFIG} ]]; then
fatal "config file '$CONFIG' doesn't exist or isn't a plaintext file"
@ -197,8 +214,7 @@ elif [[ $MODE == "test" ]]; then
url="${site_url[$idx]}"
temp=$(newtempfile)
curl -sL "$url" | ${SED} 's/>/>\n/g;s/</\n</g;' | awk NF | egrep -v "^<.*>$" > ${temp}
getcontent "$url" "$temp"
if [[ $TEST_USERES -eq 1 ]]; then
sedcmd="/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p"
cat "$temp" | ${SED} -n "$sedcmd"
@ -256,10 +272,11 @@ for x in ${!site_name[@]}; do
outfile="${OUTDIR}/${thisfile}"
temp=$(newtempfile)
temp2=$(newtempfile)
curl -sL "$url" | ${SED} 's/>/>\n/g;s/</\n</g;' | awk NF | egrep -v "^<.*>$" > "${temp}"
getcontent "$url" "$temp"
sedcmd="/${re_start}/,/${re_end}/p"
echo -e "SITE: ${sitename}\nURL: $url\n\n" >${temp2}
cat "$temp" | ${SED} -n "$sedcmd" >> ${temp2}
if [[ -e ${outfile} ]]; then
@ -293,7 +310,7 @@ done
if [[ $DOGITCOMMIT -eq 1 ]]; then
if [[ -n $filesdone ]]; then
echo -n "Doing git add..." >&2
echo -n "Doing git add for [${filesdone}]..." >&2
msg="Policies have been updated for the following sites:"
msg=$(echo -e "Policies have been updated for the following sites:\n$summary")