diff --git a/README.md b/README.md index 73e9209..5916cf5 100755 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ OPTIONS: -F char Use given character as a field separator in config file instead of default (@) -gc After site scrapes, run 'git add' on all files, then 'git commit' -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push' + -l List configured sites then exit + -L Use lynx to render html -o dirname Use given output dir instead of default (.) -t sitename Just output raw content of given site, useful for finding start/end regexps. -T sitename Just output content of given site between re_start and re_end regexps. diff --git a/tostracker.sh b/tostracker.sh index f8bc8f5..d74e5ef 100755 --- a/tostracker.sh +++ b/tostracker.sh @@ -18,6 +18,8 @@ function usage() { echo " -F char Use given character as a field separator in config file instead of default (${SEP})" echo " -gc After site scrapes, run 'git add' on all files, then 'git commit'" echo " -gp After site scrapes, run 'git add' on all files, then 'git commit', then 'git push'" + echo " -l List configured sites then exit" + echo " -L Use lynx to render html" echo " -o dirname Use given output dir instead of default (.)" echo " -t sitename Just output raw content of given site, useful for finding start/end regexps." echo " -T sitename Just output content of given site between re_start and re_end regexps." 
@@ -77,6 +79,24 @@ function showsite_byidx() { #1=idx
     printf "$FORMAT" "${site_name[$idx]}" "${site_url[$idx]}"
 }
 
+# Fetch a page and save a text version of it.
+#   1=url         page to download (curl -sL: silent, follow redirects)
+#   2=outputfile  file the result is written to
+# If LYNX is non-empty (set by -L) the html is rendered by lynx
+# (-stdin -dump); otherwise it goes through the ${SED} filter.
+# NOTE(review): the ${SED} filter text looks truncated in this patch
+# (unbalanced quote) - confirm against the repository before applying.
+function getcontent() { # 1=url 2=outputfile
+    local url="$1"
+    local outfile="$2"
+
+    if [[ -n $LYNX ]]; then
+        curl -sL "$url" | ${LYNX} -stdin -dump > "${outfile}"
+    else
+        curl -sL "$url" | ${SED} 's/>/>\n/g;s/$" > "${outfile}"
+    fi
+}
+
 trap cleanup EXIT TERM
 
 
@@ -89,8 +109,9 @@ DOGITPUSH=0
 MODE="normal"
 TESTSITE=""
 TEST_USERES=0
+LYNX=""
 
-ARGS="hc:F:g:lo:t:T:"
+ARGS="hc:F:g:lLo:t:T:"
 while getopts "$ARGS" i; do
     case "$i" in
     h)
@@ -111,6 +132,10 @@ while getopts "$ARGS" i; do
     l)
         MODE="list"
         ;;
+    L)
+        LYNX=$(type -p lynx 2>/dev/null)
+        [[ $? -ne 0 ]] && fatal "lynx not found in path"
+        ;;
     o)
         OUTDIR="$OPTARG"
         ;;
@@ -148,7 +173,6 @@ fi
 SED=$(which gsed 2>/dev/null)
 [[ $? -ne 0 ]] && SED=$(which sed 2>/dev/null)
 [[ -z $SED ]] && fatal "can't find sed in path"
-
 # Read config file
 if [[ ! -e ${CONFIG} || ! -f ${CONFIG} ]]; then
     fatal "config file '$CONFIG' doesn't exist or isn't a plaintext file"
@@ -197,8 +221,7 @@ elif [[ $MODE == "test" ]]; then
 
     url="${site_url[$idx]}"
     temp=$(newtempfile)
-
-    curl -sL "$url" | ${SED} 's/>/>\n/g;s/$" > ${temp}
+    getcontent "$url" "$temp"
     if [[ $TEST_USERES -eq 1 ]]; then
         sedcmd="/${site_re_start[$idx]}/,/${site_re_end[$idx]}/p"
         cat "$temp" | ${SED} -n "$sedcmd"
@@ -256,10 +279,11 @@ for x in ${!site_name[@]}; do
     outfile="${OUTDIR}/${thisfile}"
     temp=$(newtempfile)
     temp2=$(newtempfile)
-    curl -sL "$url" | ${SED} 's/>/>\n/g;s/$" > "${temp}"
+    getcontent "$url" "$temp"
     sedcmd="/${re_start}/,/${re_end}/p"
 
     echo -e "SITE: ${sitename}\nURL: $url\n\n" >${temp2}
+
     cat "$temp" | ${SED} -n "$sedcmd" >> ${temp2}
 
     if [[ -e ${outfile} ]]; then
@@ -293,7 +317,7 @@ done
 
 if [[ $DOGITCOMMIT -eq 1 ]]; then
     if [[ -n $filesdone ]]; then
-        echo -n "Doing git add..." >&2
+        echo -n "Doing git add for [${filesdone}]..." >&2
         msg="Policies have been updated for the following sites:"
         msg=$(echo -e "Policies have been updated for the following sites:\n$summary")