Mercurial > hg > cc > azure
diff master/src/wecu/run_sac.sh @ 62:892e1c0240e1
Added more robust (I hope) error handling;
got the reducer working, with support for choosing dict or TSV output.
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 02 Jun 2020 17:35:07 +0000 |
parents | cfaf5223b071 |
children | d46c8b12fc04 |
line wrap: on
line diff
```diff
--- a/master/src/wecu/run_sac.sh Sun May 31 12:06:44 2020 +0000
+++ b/master/src/wecu/run_sac.sh Tue Jun 02 17:35:07 2020 +0000
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns
+echo "$@" 1>cmd
 cores=$1
 hosts=$2
 wd=$3
@@ -16,13 +17,18 @@
 else
  filter=\"\"
 fi
-
-rm -f allout
+if [ "$1" = "-k" ]
+then
+ shift
+ numKeys="$1"
+ shift
+fi
 
 # Get quoting right...
 worker () {
  set -e
  set -o pipefail
+ mkdir -p logs
  f=$1
  shift
  j=$1
@@ -32,11 +38,15 @@
  filter="$1"
  shift
  shift # we don't need/want the resType either
- echo $(date) $(hostname) start $f >>${j}_log
+ me=$(hostname | cut -c 15)
+ ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
+ echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
  export PYTHONIOENCODING=utf-8
- { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
- unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log
- echo $(date) $(hostname) finished $f >>${j}_log
+ { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
+ unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1
+ { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
+ printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
+ # guarantee atomic entry in the log
 }
 
 export -f worker
@@ -50,5 +60,6 @@
  --workdir $wd \
  -a input_paths \
  --env worker \
- worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\
- sac_reducer.py "$@"
+ --return 'logs/{#}_log' --cleanup \
+ worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\
+ sac_reducer.py $1 $numKeys
```