Mercurial > hg > cc > azure
view master/src/wecu/run_sac.sh @ 64:b91e44355bbf
fix minor argument passing snafus
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 22:08:01 +0000 |
parents | d46c8b12fc04 |
children | e1f61f94b196 |
line wrap: on
line source
#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
#
# Runs a mapper over every Common Crawl file listed in ./input_paths using
# GNU parallel across the hosts in hostsFilename, brings the per-job results
# (res/N.tsv) and logs (logs/N_log) back to the current directory, and finally
# pipes all results through sac_reducer.py.
#
# Positional arguments:
#   numcores       value given to parallel --jobs
#   hostsFilename  value given to parallel --sshloginfile
#   workDir        value given to parallel --workdir
#   mapper         mapper executable; located with `which` and transferred
#                  to the workers
# Optional arguments (parenthesised above), recognised only in this order:
#   -h keyHandler  handed to the mapper as its first argument
#   -f filter      command prefix placed in front of the mapper invocation
#   -k numKeys     handed to sac_reducer.py after resType
# Everything that remains (resType patType patterns...) is forwarded to the
# worker; resType is additionally the first argument of sac_reducer.py.

usage='Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns'

# Keep a record of how we were invoked.
echo "$@" 1>cmd

# Without the four leading positionals parallel would be started with empty
# option values, so stop early instead.
if [ "$#" -lt 4 ]; then
  echo "$usage" 1>&2
  exit 2
fi

cores=$1
hosts=$2
wd=$3
mapper=$4
shift 4

# Start from known-empty values so that same-named environment variables
# cannot leak into the run.
keyHandler=
numKeys=

if [ "$1" = "-h" ]; then
  shift
  keyHandler="$1"
  shift
fi

if [ "$1" = "-f" ]; then
  shift
  filter="$1"
  shift
else
  # A literal pair of double quotes, not an empty string: presumably so that
  # the "no filter" argument survives the extra round of shell parsing the
  # command line goes through on its way to the worker -- TODO confirm.
  filter=\"\"
fi

if [ "$1" = "-k" ]; then
  shift
  numKeys="$1"
  shift
fi

# Get quoting right...

# worker FILE JOBNO MAPPER FILTER KEYHANDLER RESTYPE [MAPPER-ARGS...]
#
# Processes one input file; meant to be run by parallel, once per line of
# input_paths.
#   FILE        path appended to https://commoncrawl.s3.amazonaws.com/
#   JOBNO       parallel's job sequence number ({#}); names the output files
#   MAPPER      mapper file name, run as ./MAPPER
#   FILTER      command prefix for the mapper (may expand to nothing)
#   KEYHANDLER  first argument given to the mapper
#   RESTYPE     discarded; only the reducer uses it
#   MAPPER-ARGS remaining arguments, given to the mapper after KEYHANDLER
# Writes the mapper's stdout to res/JOBNO.tsv and appends start/finish lines
# plus everything the pipeline wrote to stderr to logs/JOBNO_log.
worker () {
  set -e
  set -o pipefail
  mkdir -p logs res

  local f=$1
  shift
  local j=$1
  shift
  local mapper="$1"
  shift
  local filter="$1"
  shift
  local keyHandler="$1"
  shift
  shift # we don't need/want the resType either

  local me ff
  local -a stderr
  # Only the 15th character of the host name is used as the host tag in the
  # log (presumably the part that differs between workers -- TODO confirm).
  me=$(hostname | cut -c 15)
  # Short label for the input: fields 4 and 6 of the '/'-separated path, with
  # "CC-MAIN-" and the ".warc..." suffix removed.
  ff=$(printf '%s\n' "$f" | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')

  # The echo arguments are left unquoted so the format of the log line is
  # exactly what it has always been, including when a field is empty.
  # shellcheck disable=SC2046,SC2086
  echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>"logs/${j}_log"

  export PYTHONIOENCODING=utf-8

  # Download | decompress | map.  Inside the group, "2>&1 1>file" sends the
  # pipeline's stderr to the command substitution and its stdout to the
  # result file; with IFS set to newline the captured stderr becomes one
  # array element per line.  $filter is deliberately unquoted so that an
  # empty filter disappears from the command.
  {
    IFS=$'\n'
    # shellcheck disable=SC2086,SC2207
    stderr=( $( { curl -s -N "https://commoncrawl.s3.amazonaws.com/$f" | \
                    unpigz -dp 1 -c | \
                    $filter "./$mapper" "$keyHandler" "$@" ; } 2>&1 1>"res/${j}.tsv" ; ) )
    unset IFS
  }

  # Emit the finish line and the captured stderr (indented by sed from the
  # second line on) through a single append.
  {
    # shellcheck disable=SC2046,SC2086
    echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
    printf '%s\n' "${stderr[@]}"
  } | sed '2,$s/^/ /' >>"logs/${j}_log" # hack to try to
                                        # guarantee atomic entry in the log
}

export -f worker

# Show on stderr the command template that is handed to parallel.
echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2
date 1>&2

# Options for parallel, kept in an array so every value stays a single word.
parallel_opts=(
  --sshloginfile "$hosts"
  --retries 3
  --will-cite
  --jobs "$cores"
  --workdir "$wd"
  -a input_paths
  --env worker
  --return 'logs/{#}_log'
  --return 'res/{#}.tsv'
  --cleanup
)

# Full path of the mapper with "./" inserted after the last slash
# (/a/b/mapper -> /a/b/./mapper), which is the form given to --transferfile.
mapper_path=$(which "$mapper" | sed 's/\(^.*\/\)/\1.\//')
if [ -n "$mapper_path" ]; then
  parallel_opts+=( --transferfile "$mapper_path" )
else
  # An empty value here used to make --transferfile swallow the next option.
  echo "run_sac.sh: cannot locate mapper '$mapper'; not transferring it" 1>&2
fi

# NOTE(review): when -h is not given, "$keyHandler" is an empty argument
# here.  If empty arguments are lost when the command line is re-assembled
# for the worker (which may be why filter defaults to a literal ""), the
# worker's positional arguments shift by one in that case -- confirm against
# the mapper's expected arguments before changing.
parallel "${parallel_opts[@]}" \
    worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
res=$?

# Report parallel's exit status; the reducer is run regardless of it.
# shellcheck disable=SC2046
echo $(date) $res

# $1 is resType at this point.  Empty values are dropped rather than passed
# as empty arguments, as before.
cat res/*.tsv | sac_reducer.py ${1:+"$1"} ${numKeys:+"$numKeys"}