Add -L option to use lynx to render html
This commit is contained in:
parent
391d137a21
commit
2825e35152
|
@ -15,6 +15,8 @@ OPTIONS:
|
|||
-F char Use given character as a field separator in config file instead of default (@)
|
||||
-gc After site scrapes, run 'git add' on all files, then 'git commit'
|
||||
-gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'
|
||||
-l List configured sites then exit
|
||||
-L Use lynx to render html
|
||||
-o dirname Use given output dir instead of default (.)
|
||||
-t sitename Just output raw content of given site, useful for finding start/end regexps.
|
||||
-T sitename Just output content of given site between re_start and re_end regexps.
|
||||
|
|
|
@ -18,6 +18,8 @@ function usage() {
|
|||
echo " -F char Use given character as a field separator in config file instead of default (${SEP})"
|
||||
echo " -gc After site scrapes, run 'git add' on all files, then 'git commit'"
|
||||
echo " -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'"
|
||||
echo " -l List configured sites then exit"
|
||||
echo " -L Use lynx to render html"
|
||||
echo " -o dirname Use given output dir instead of default (.)"
|
||||
echo " -t sitename Just output raw content of given site, useful for finding start/end regexps."
|
||||
echo " -T sitename Just output content of given site between re_start and re_end regexps."
|
||||
|
@ -77,6 +79,17 @@ function showsite_byidx() { #1=idx
|
|||
printf "$FORMAT" "${site_name[$idx]}" "${site_url[$idx]}"
|
||||
}
|
||||
|
||||
# Fetch a URL with curl and write a plain-text rendering of it to a file.
#
# Arguments:
#   $1 - URL to fetch (curl runs silently and follows redirects)
#   $2 - path of the file that receives the rendered text (overwritten)
# Globals read:
#   LYNX - when non-empty, the command used to render the HTML; it is run
#          with "-stdin -dump" and its output becomes the file content
#   SED  - sed command used by the fallback tag stripper
#
# Fallback (LYNX empty): put every tag on its own line, drop blank lines,
# then drop every line that starts with '<' and ends with '>'.
# NOTE(review): the "\n" in the sed replacement presumably relies on GNU sed
# behaviour - confirm if other sed implementations must be supported.
function getcontent() { # 1=url 2=outputfile
  local url="$1"
  local outfile="$2"

  if [[ -n $LYNX ]]; then
    # Redirect to "$outfile": the previously used "${out}" was never set,
    # so the redirection failed and the output file stayed empty.
    curl -sL "$url" | "${LYNX}" -stdin -dump > "${outfile}"
  else
    curl -sL "$url" \
      | "${SED}" 's/>/>\n/g;s/</\n</g;' \
      | awk NF \
      | grep -Ev "^<.*>$" > "${outfile}"
  fi
}
|
||||
|
||||
trap cleanup EXIT TERM
|
||||
|
||||
|
||||
|
@ -89,8 +102,9 @@ DOGITPUSH=0
|
|||
MODE="normal"
|
||||
TESTSITE=""
|
||||
TEST_USERES=0
|
||||
LYNX=""
|
||||
|
||||
ARGS="hc:F:g:lo:t:T:"
|
||||
ARGS="hc:F:g:lLo:t:T:"
|
||||
while getopts "$ARGS" i; do
|
||||
case "$i" in
|
||||
h)
|
||||
|
@ -111,6 +125,10 @@ while getopts "$ARGS" i; do
|
|||
l)
|
||||
MODE="list"
|
||||
;;
|
||||
L)
|
||||
LYNX=$(type -p lynx 2>/dev/null)
|
||||
[[ $? -ne 0 ]] && fatal "lynx not found in path"
|
||||
;;
|
||||
o)
|
||||
OUTDIR="$OPTARG"
|
||||
;;
|
||||
|
@ -148,7 +166,6 @@ fi
|
|||
SED=$(which gsed 2>/dev/null)
|
||||
[[ $? -ne 0 ]] && SED=$(which sed 2>/dev/null)
|
||||
[[ -z $SED ]] && fatal "can't find sed in path"
|
||||
|
||||
# Read config file
|
||||
if [[ ! -e ${CONFIG} || ! -f ${CONFIG} ]]; then
|
||||
fatal "config file '$CONFIG' doesn't exist or isn't a plaintext file"
|
||||
|
@ -197,8 +214,7 @@ elif [[ $MODE == "test" ]]; then
|
|||
url="${site_url[$idx]}"
|
||||
|
||||
temp=$(newtempfile)
|
||||
|
||||
curl -sL "$url" | ${SED} 's/>/>\n/g;s/</\n</g;' | awk NF | egrep -v "^<.*>$" > ${temp}
|
||||
getcontent "$url" "$temp"
|
||||
if [[ $TEST_USERES -eq 1 ]]; then
|
||||
sedcmd="/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p"
|
||||
cat "$temp" | ${SED} -n "$sedcmd"
|
||||
|
@ -256,10 +272,11 @@ for x in ${!site_name[@]}; do
|
|||
outfile="${OUTDIR}/${thisfile}"
|
||||
temp=$(newtempfile)
|
||||
temp2=$(newtempfile)
|
||||
curl -sL "$url" | ${SED} 's/>/>\n/g;s/</\n</g;' | awk NF | egrep -v "^<.*>$" > "${temp}"
|
||||
getcontent "$url" "$temp"
|
||||
sedcmd="/${re_start}/,/${re_end}/p"
|
||||
|
||||
echo -e "SITE: ${sitename}\nURL: $url\n\n" >${temp2}
|
||||
|
||||
cat "$temp" | ${SED} -n "$sedcmd" >> ${temp2}
|
||||
|
||||
if [[ -e ${outfile} ]]; then
|
||||
|
@ -293,7 +310,7 @@ done
|
|||
|
||||
if [[ $DOGITCOMMIT -eq 1 ]]; then
|
||||
if [[ -n $filesdone ]]; then
|
||||
echo -n "Doing git add..." >&2
|
||||
echo -n "Doing git add for [${filesdone}]..." >&2
|
||||
|
||||
msg="Policies have been updated for the following sites:"
|
||||
msg=$(echo -e "Policies have been updated for the following sites:\n$summary")
|
||||
|
|
Loading…
Reference in New Issue