Add -L option to use lynx to render html

This commit is contained in:
Rob Pearce 2024-06-30 09:40:51 +10:00
parent 391d137a21
commit 2825e35152
2 changed files with 25 additions and 6 deletions

View File

@ -15,6 +15,8 @@ OPTIONS:
-F char Use given character as a field separator in config file instead of default (@) -F char Use given character as a field separator in config file instead of default (@)
-gc After site scrapes, run 'git add' on all files, then 'git commit' -gc After site scrapes, run 'git add' on all files, then 'git commit'
-gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push' -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'
-l List configured sites then exit
-L Use lynx to render html
-o dirname Use given output dir instead of default (.) -o dirname Use given output dir instead of default (.)
-t sitename Just output raw content of given site, useful for finding start/end regexps. -t sitename Just output raw content of given site, useful for finding start/end regexps.
-T sitename Just output content of given site between re_start and re_end regexps. -T sitename Just output content of given site between re_start and re_end regexps.

View File

@ -18,6 +18,8 @@ function usage() {
echo " -F char Use given character as a field separator in config file instead of default (${SEP})" echo " -F char Use given character as a field separator in config file instead of default (${SEP})"
echo " -gc After site scrapes, run 'git add' on all files, then 'git commit'" echo " -gc After site scrapes, run 'git add' on all files, then 'git commit'"
echo " -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'" echo " -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'"
echo " -l List configured sites then exit"
echo " -L Use lynx to render html"
echo " -o dirname Use given output dir instead of default (.)" echo " -o dirname Use given output dir instead of default (.)"
echo " -t sitename Just output raw content of given site, useful for finding start/end regexps." echo " -t sitename Just output raw content of given site, useful for finding start/end regexps."
echo " -T sitename Just output content of given site between re_start and re_end regexps." echo " -T sitename Just output content of given site between re_start and re_end regexps."
@ -77,6 +79,17 @@ function showsite_byidx() { #1=idx
printf "$FORMAT" "${site_name[$idx]}" "${site_url[$idx]}" printf "$FORMAT" "${site_name[$idx]}" "${site_url[$idx]}"
} }
# Fetch a URL and write a text rendering of it to a file.
# Globals:
#   LYNX - if non-empty, command used to render the page (run with -stdin -dump)
#   SED  - sed command used by the fallback tag stripper
# Arguments:
#   $1 - url to fetch (curl -sL, so redirects are followed silently)
#   $2 - file the rendered text is written to (overwritten)
function getcontent() { # 1=url 2=outputfile
  local url="$1"
  local outfile="$2"
  if [[ -n $LYNX ]]; then
    # Let lynx turn the HTML into plain text.
    curl -sL "$url" | "${LYNX}" -stdin -dump > "${outfile}"
  else
    # Fallback: put every tag on its own line, drop blank lines, then drop
    # lines that consist solely of a tag, leaving only the text content.
    curl -sL "$url" \
      | "${SED}" 's/>/>\n/g;s/</\n</g;' \
      | awk NF \
      | grep -E -v '^<.*>$' > "${outfile}"
  fi
}
trap cleanup EXIT TERM trap cleanup EXIT TERM
@ -89,8 +102,9 @@ DOGITPUSH=0
MODE="normal" MODE="normal"
TESTSITE="" TESTSITE=""
TEST_USERES=0 TEST_USERES=0
LYNX=""
ARGS="hc:F:g:lo:t:T:" ARGS="hc:F:g:lLo:t:T:"
while getopts "$ARGS" i; do while getopts "$ARGS" i; do
case "$i" in case "$i" in
h) h)
@ -111,6 +125,10 @@ while getopts "$ARGS" i; do
l) l)
MODE="list" MODE="list"
;; ;;
L)
LYNX=$(type -p lynx 2>/dev/null)
[[ $? -ne 0 ]] && fatal "lynx not found in path"
;;
o) o)
OUTDIR="$OPTARG" OUTDIR="$OPTARG"
;; ;;
@ -148,7 +166,6 @@ fi
SED=$(which gsed 2>/dev/null) SED=$(which gsed 2>/dev/null)
[[ $? -ne 0 ]] && SED=$(which sed 2>/dev/null) [[ $? -ne 0 ]] && SED=$(which sed 2>/dev/null)
[[ -z $SED ]] && fatal "can't find sed in path" [[ -z $SED ]] && fatal "can't find sed in path"
# Read config file # Read config file
if [[ ! -e ${CONFIG} || ! -f ${CONFIG} ]]; then if [[ ! -e ${CONFIG} || ! -f ${CONFIG} ]]; then
fatal "config file '$CONFIG' doesn't exist or isn't a plaintext file" fatal "config file '$CONFIG' doesn't exist or isn't a plaintext file"
@ -197,8 +214,7 @@ elif [[ $MODE == "test" ]]; then
url="${site_url[$idx]}" url="${site_url[$idx]}"
temp=$(newtempfile) temp=$(newtempfile)
getcontent "$url" "$temp"
curl -sL "$url" | ${SED} 's/>/>\n/g;s/</\n</g;' | awk NF | egrep -v "^<.*>$" > ${temp}
if [[ $TEST_USERES -eq 1 ]]; then if [[ $TEST_USERES -eq 1 ]]; then
sedcmd="/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p" sedcmd="/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p"
cat "$temp" | ${SED} -n "$sedcmd" cat "$temp" | ${SED} -n "$sedcmd"
@ -256,10 +272,11 @@ for x in ${!site_name[@]}; do
outfile="${OUTDIR}/${thisfile}" outfile="${OUTDIR}/${thisfile}"
temp=$(newtempfile) temp=$(newtempfile)
temp2=$(newtempfile) temp2=$(newtempfile)
curl -sL "$url" | ${SED} 's/>/>\n/g;s/</\n</g;' | awk NF | egrep -v "^<.*>$" > "${temp}" getcontent "$url" "$temp"
sedcmd="/${re_start}/,/${re_end}/p" sedcmd="/${re_start}/,/${re_end}/p"
echo -e "SITE: ${sitename}\nURL: $url\n\n" >${temp2} echo -e "SITE: ${sitename}\nURL: $url\n\n" >${temp2}
cat "$temp" | ${SED} -n "$sedcmd" >> ${temp2} cat "$temp" | ${SED} -n "$sedcmd" >> ${temp2}
if [[ -e ${outfile} ]]; then if [[ -e ${outfile} ]]; then
@ -293,7 +310,7 @@ done
if [[ $DOGITCOMMIT -eq 1 ]]; then if [[ $DOGITCOMMIT -eq 1 ]]; then
if [[ -n $filesdone ]]; then if [[ -n $filesdone ]]; then
echo -n "Doing git add..." >&2 echo -n "Doing git add for [${filesdone}]..." >&2
msg="Policies have been updated for the following sites:" msg="Policies have been updated for the following sites:"
msg=$(echo -e "Policies have been updated for the following sites:\n$summary") msg=$(echo -e "Policies have been updated for the following sites:\n$summary")