view master/src/wecu/run_sac.sh @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents 5fdca5baa4e9
children 892e1c0240e1
line wrap: on
line source

#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns
#
# Leading arguments, in order:
#   numcores       handed to parallel as --jobs
#   hostsFilename  handed to parallel as --sshloginfile
#   workDir        handed to parallel as --workdir
#   mapper         program each worker runs as ./<mapper>
#   -f filter      optional; words the worker places in front of ./<mapper>
# Whatever remains (resType patType patterns) stays in "$@" and is forwarded
# both to the worker and to sac_reducer.py.
cores=$1 hosts=$2 wd=$3 mapper=$4

# Drop the four leading arguments one at a time, so that a short argument
# list is simply emptied rather than left untouched.
for _ in 1 2 3 4; do
  shift
done

case "$1" in
  -f)
    shift
    filter="$1"
    shift
    ;;
  *)
    # Two literal double-quote characters, not an empty string -- presumably
    # so the argument is still present (as an empty word) once the command
    # line assembled by parallel has been re-parsed by a shell; TODO confirm.
    filter='""'
    ;;
esac

rm -f allout

# Get quoting right...
#######################################
# Body of one job: stream a single compressed file from the commoncrawl S3
# bucket, unpack it and pipe it into the mapper. The mapper's stdout is the
# job's stdout; stderr of the whole pipeline is appended to the job log.
# Arguments:
#   $1   - path of the file below https://commoncrawl.s3.amazonaws.com/
#   $2   - job id, used only as the prefix of the log file <id>_log
#   $3   - mapper, run as ./<mapper> from the current directory
#   $4   - filter: words placed in front of the mapper command (may be empty)
#   $5   - resType, discarded
#   $6.. - passed to the mapper unchanged
# Outputs:
#   mapper output on stdout; start/finish lines and stderr in <id>_log
# Returns:
#   under `set -e` / `pipefail`, a failing stage of the pipeline aborts the
#   job with a non-zero status and no "finished" line is logged
#######################################
worker () {
  set -e
  set -o pipefail
  local f j mapper filter
  f=$1 j=$2 mapper=$3 filter=$4
  shift 4
  shift # we don't need/want the resType either

  # $(date) and $(hostname) stay unquoted so the log line keeps its existing
  # single-space layout; the input path is quoted so it is logged verbatim.
  # shellcheck disable=SC2046
  echo $(date) $(hostname) start "$f" >>"${j}_log"
  export PYTHONIOENCODING=utf-8

  # $filter is left unquoted on purpose: an empty filter must disappear
  # entirely and a multi-word filter must split into command + arguments.
  # shellcheck disable=SC2086
  {
    curl -s -N "https://commoncrawl.s3.amazonaws.com/$f" \
      | unpigz -dp 1 -c \
      | $filter "./$mapper" "$@"
  } 2>>"${j}_log"

  # shellcheck disable=SC2046
  echo $(date) $(hostname) finished "$f" >>"${j}_log"
}

# Export the function so the shells started by parallel can see it; it is
# named again in --env below so that parallel passes it on to the jobs.
export -f worker

# Resolve the mapper to the file found on PATH. Done up front because an
# empty result used to leave --transferfile without a value, making it
# consume the next option instead.
if ! mapper_path=$(type -P "$mapper"); then
  printf '%s: cannot find mapper on PATH: %s\n' "${0##*/}" "$mapper" >&2
  exit 1
fi

# Insert "./" after the last "/" of the path (dir/name -> dir/./name).
# NOTE(review): per the GNU parallel manual a "/./" in a --transferfile path
# makes the remainder relative to the work dir, which would match the worker
# running ./<mapper> -- confirm against the installed parallel version.
mapper_xfer=$(sed 's/\(^.*\/\)/\1.\//' <<<"$mapper_path")

# One worker job per line of input_paths ({} = the line, {#} = job number).
# Stderr of parallel goes to the file errs; lines containing
# 'Authorized uses only' are dropped from the combined job output, the number
# of remaining lines is reported on stderr, and the lines themselves are fed
# to sac_reducer.py.
parallel \
    --sshloginfile "$hosts" \
    --retries 3 \
    --transferfile "$mapper_xfer" \
    --will-cite \
    --jobs "$cores" \
    --workdir "$wd" \
    -a input_paths \
    --env worker \
    worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\
    sac_reducer.py "$@"