Mercurial > hg > cc > azure
changeset 64:b91e44355bbf
fix minor argument passing snafus
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 22:08:01 +0000 |
parents | d46c8b12fc04 |
children | e1f61f94b196 |
files | master/src/wecu/run_sac.sh master/src/wecu/sac_reducer.py master/src/wecu/wecu.py |
diffstat | 3 files changed, 23 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/master/src/wecu/run_sac.sh Wed Jun 03 16:40:34 2020 +0000
+++ b/master/src/wecu/run_sac.sh Wed Jun 03 22:08:01 2020 +0000
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
 echo "$@" 1>cmd
 cores=$1
 hosts=$2
@@ -9,6 +9,12 @@
 shift
 shift
 shift
+if [ "$1" = "-h" ]
+then
+ shift
+ keyHandler="$1"
+ shift
+fi
 if [ "$1" = "-f" ]
 then
 shift
@@ -38,13 +44,15 @@
 shift
 filter="$1"
 shift
+ keyHandler="$1"
+ shift
 shift # we don't need/want the resType either
 me=$(hostname | cut -c 15)
 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
 export PYTHONIOENCODING=utf-8
 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
- unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; }
+ unpigz -dp 1 -c | $filter ./$mapper "$keyHandler" "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; }
 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
 # guarantee atomic entry in the log
@@ -52,6 +60,9 @@
 
 export -f worker
 
+echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2
+
+date 1>&2
 parallel \
 --sshloginfile $hosts \
 --retries 3 \
@@ -62,5 +73,7 @@
 -a input_paths \
 --env worker \
 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
- worker '{}' '{#}' "$mapper" "$filter" "$@"
+ worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
+res=$?
+echo $(date) $res
 cat res/*.tsv | sac_reducer.py $1 $numKeys
```
```diff
--- a/master/src/wecu/sac_reducer.py Wed Jun 03 16:40:34 2020 +0000
+++ b/master/src/wecu/sac_reducer.py Wed Jun 03 22:08:01 2020 +0000
@@ -7,8 +7,8 @@
 
 import sys
 
-#print('reducing',sys.argv,file=sys.stderr)
-#sys.stderr.flush()
+print('reducing',sys.argv,file=sys.stderr)
+sys.stderr.flush()
 
 rtype=sys.argv[1]
 numKeys=int(sys.argv[2]) if len(sys.argv)==3 else 1
@@ -52,10 +52,10 @@
 print('bogus',line,ll,file=sys.stderr)
 continue
 
-# print('nc',len(res),file=sys.stderr)
-# if numKeys>1:
-# print(' ',list(res.keys()),"\n ",
-# list(sum(len(res[i][j]) for j in res[i].keys()) for i in res.keys()), file=sys.stderr)
+ print('nc',len(res),file=sys.stderr)
+ if numKeys>1:
+ print(' ',list(res.keys()),"\n ",
+ list(sum(len(res[i][j]) for j in res[i].keys()) for i in res.keys()), file=sys.stderr)
 if rtype=='dict':
 print('res=',end='')
 from pprint import pprint
```
```diff
--- a/master/src/wecu/wecu.py Wed Jun 03 16:40:34 2020 +0000
+++ b/master/src/wecu/wecu.py Wed Jun 03 22:08:01 2020 +0000
@@ -112,7 +112,7 @@
 ('sac_mapper.py' if args.mapper is None
 else args.mapper),
 ('' if args.keyHandler is None
- else "-h %s"%args.keyHandler)
+ else "-h %s"%args.keyHandler),
 ('' if args.filter is None
 else "-f '%s'"%args.filter),
 ('' if args.numKeys is None
```