comparison master/src/wecu/run_sac.sh @ 63:d46c8b12fc04

support multiple approaches to key combination, use local files to collect results
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 16:40:34 +0000
parents 892e1c0240e1
children b91e44355bbf
comparison
equal deleted inserted replaced
62:892e1c0240e1 63:d46c8b12fc04
1 #!/bin/bash 1 #!/bin/bash
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns
3 echo "$@" 1>cmd 3 echo "$@" 1>cmd
4 cores=$1 4 cores=$1
5 hosts=$2 5 hosts=$2
6 wd=$3 6 wd=$3
7 mapper=$4 7 mapper=$4
27 # Get quoting right... 27 # Get quoting right...
28 worker () { 28 worker () {
29 set -e 29 set -e
30 set -o pipefail 30 set -o pipefail
31 mkdir -p logs 31 mkdir -p logs
32 mkdir -p res
32 f=$1 33 f=$1
33 shift 34 shift
34 j=$1 35 j=$1
35 shift 36 shift
36 mapper="$1" 37 mapper="$1"
41 me=$(hostname | cut -c 15) 42 me=$(hostname | cut -c 15)
42 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') 43 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
43 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log 44 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
44 export PYTHONIOENCODING=utf-8 45 export PYTHONIOENCODING=utf-8
45 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ 46 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
46 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1 47 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; }
47 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff 48 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
48 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to 49 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
49 # guarantee atomic entry in the log 50 # guarantee atomic entry in the log
50 } 51 }
51 52
58 --will-cite \ 59 --will-cite \
59 --jobs $cores \ 60 --jobs $cores \
60 --workdir $wd \ 61 --workdir $wd \
61 -a input_paths \ 62 -a input_paths \
62 --env worker \ 63 --env worker \
63 --return 'logs/{#}_log' --cleanup \ 64 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
64 worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\ 65 worker '{}' '{#}' "$mapper" "$filter" "$@"
65 sac_reducer.py $1 $numKeys 66 cat res/*.tsv | sac_reducer.py $1 $numKeys