Mercurial > hg > cc > azure
comparison master/src/wecu/run_sac.sh @ 63:d46c8b12fc04
Support multiple approaches to key combination; use local files to collect results
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 16:40:34 +0000 |
parents | 892e1c0240e1 |
children | b91e44355bbf |
comparison legend:
- equal
- deleted
- inserted
- replaced
62:892e1c0240e1 | 63:d46c8b12fc04 |
---|---|
1 #!/bin/bash | 1 #!/bin/bash |
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns | 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns |
3 echo "$@" 1>cmd | 3 echo "$@" 1>cmd |
4 cores=$1 | 4 cores=$1 |
5 hosts=$2 | 5 hosts=$2 |
6 wd=$3 | 6 wd=$3 |
7 mapper=$4 | 7 mapper=$4 |
27 # Get quoting right... | 27 # Get quoting right... |
28 worker () { | 28 worker () { |
29 set -e | 29 set -e |
30 set -o pipefail | 30 set -o pipefail |
31 mkdir -p logs | 31 mkdir -p logs |
32 mkdir -p res | |
32 f=$1 | 33 f=$1 |
33 shift | 34 shift |
34 j=$1 | 35 j=$1 |
35 shift | 36 shift |
36 mapper="$1" | 37 mapper="$1" |
41 me=$(hostname | cut -c 15) | 42 me=$(hostname | cut -c 15) |
42 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') | 43 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') |
43 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log | 44 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log |
44 export PYTHONIOENCODING=utf-8 | 45 export PYTHONIOENCODING=utf-8 |
45 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ | 46 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ |
46 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1 | 47 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; } |
47 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff | 48 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff |
48 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to | 49 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to |
49 # guarantee atomic entry in the log | 50 # guarantee atomic entry in the log |
50 } | 51 } |
51 | 52 |
58 --will-cite \ | 59 --will-cite \ |
59 --jobs $cores \ | 60 --jobs $cores \ |
60 --workdir $wd \ | 61 --workdir $wd \ |
61 -a input_paths \ | 62 -a input_paths \ |
62 --env worker \ | 63 --env worker \ |
63 --return 'logs/{#}_log' --cleanup \ | 64 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ |
64 worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\ | 65 worker '{}' '{#}' "$mapper" "$filter" "$@" |
65 sac_reducer.py $1 $numKeys | 66 cat res/*.tsv | sac_reducer.py $1 $numKeys |