changeset 64:b91e44355bbf

fix minor argument passing snafus
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 22:08:01 +0000
parents d46c8b12fc04
children e1f61f94b196
files master/src/wecu/run_sac.sh master/src/wecu/sac_reducer.py master/src/wecu/wecu.py
diffstat 3 files changed, 23 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/master/src/wecu/run_sac.sh	Wed Jun 03 16:40:34 2020 +0000
+++ b/master/src/wecu/run_sac.sh	Wed Jun 03 22:08:01 2020 +0000
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
 echo "$@" 1>cmd
 cores=$1
 hosts=$2
@@ -9,6 +9,12 @@
 shift
 shift
 shift
+if [ "$1" = "-h" ]
+then
+ shift
+ keyHandler="$1"
+ shift
+fi
 if [ "$1" = "-f" ]
 then
  shift
@@ -38,13 +44,15 @@
   shift
   filter="$1"
   shift
+  keyHandler="$1"
+  shift
   shift # we don't need/want the resType either
   me=$(hostname | cut -c 15)
   ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
   echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
   export PYTHONIOENCODING=utf-8
   { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
-   unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; }
+   unpigz -dp 1 -c | $filter ./$mapper "$keyHandler" "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; }
   { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
     printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
       # guarantee atomic entry in the log
@@ -52,6 +60,9 @@
 
 export -f worker
 
+echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2
+
+date 1>&2
 parallel \
     --sshloginfile $hosts \
     --retries 3 \
@@ -62,5 +73,7 @@
     -a input_paths \
     --env worker \
     --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
-    worker '{}' '{#}' "$mapper" "$filter" "$@"
+    worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
+res=$?
+echo $(date) $res
 cat res/*.tsv | sac_reducer.py $1 $numKeys
--- a/master/src/wecu/sac_reducer.py	Wed Jun 03 16:40:34 2020 +0000
+++ b/master/src/wecu/sac_reducer.py	Wed Jun 03 22:08:01 2020 +0000
@@ -7,8 +7,8 @@
 
 import sys
 
-#print('reducing',sys.argv,file=sys.stderr)
-#sys.stderr.flush()
+print('reducing',sys.argv,file=sys.stderr)
+sys.stderr.flush()
 
 rtype=sys.argv[1]
 numKeys=int(sys.argv[2]) if len(sys.argv)==3 else 1
@@ -52,10 +52,10 @@
             print('bogus',line,ll,file=sys.stderr)
             continue
 
-#    print('nc',len(res),file=sys.stderr)
-#    if numKeys>1:
-#        print(' ',list(res.keys()),"\n ",
-#              list(sum(len(res[i][j]) for j in res[i].keys()) for i in res.keys()), file=sys.stderr)
+    print('nc',len(res),file=sys.stderr)
+    if numKeys>1:
+        print(' ',list(res.keys()),"\n ",
+              list(sum(len(res[i][j]) for j in res[i].keys()) for i in res.keys()), file=sys.stderr)
     if rtype=='dict':
         print('res=',end='')
         from pprint import pprint
--- a/master/src/wecu/wecu.py	Wed Jun 03 16:40:34 2020 +0000
+++ b/master/src/wecu/wecu.py	Wed Jun 03 22:08:01 2020 +0000
@@ -112,7 +112,7 @@
         ('sac_mapper.py' if args.mapper is None
          else args.mapper),
         ('' if args.keyHandler is None
-         else "-h %s"%args.keyHandler)
+         else "-h %s"%args.keyHandler),
         ('' if args.filter is None
          else "-f '%s'"%args.filter),
         ('' if args.numKeys is None