view master/src/wecu/run_sac.sh @ 59:8332faef25e1

get quoting and arg positions right
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 28 May 2020 09:58:38 +0000
parents a3edba8dab11
children 5fdca5baa4e9
line wrap: on
line source

#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir resType patType patterns
#
# The three leading arguments are captured into variables here and then
# removed, so that "$@" afterwards holds only resType, patType and patterns.
cores="${1}"   # numcores
hosts="${2}"   # hostsFilename
wd="${3}"      # workDir
shift; shift; shift
# Discard any allout file left in the current directory.
rm -f allout

#######################################
# Fetch one file from the commoncrawl S3 bucket, decompress it with unpigz
# and pipe the result through ./sac_mapper.py.
# Wrapping the pipeline in a function is what the original note
# ("Get quoting right...") refers to: the mapper arguments arrive via "$@".
# Arguments:
#   $1  - path appended to https://commoncrawl.s3.amazonaws.com/
#   $2  - resType; dropped, the mapper is not given it
#   $3+ - passed through unchanged to ./sac_mapper.py
# Outputs:
#   stdout - the mapper's stdout and stderr, merged (2>&1)
#   stderr - the host name, then the decompressed line count from wc -l
# Returns:
#   exit status of ./sac_mapper.py (last stage of the pipeline)
#######################################
worker () {
  # local: keep f from leaking into (or clobbering) the caller's scope
  local f=$1
  shift
  shift # we don't need/want the resType either
  hostname 1>&2
  export PYTHONIOENCODING=utf-8
  # The URL is quoted so the path is never word-split or glob-expanded.
  curl -s -N "https://commoncrawl.s3.amazonaws.com/$f" | \
   unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./sac_mapper.py "$@" 2>&1
}

# Export the function so that parallel's --env worker can pass it on to the
# shells that run the jobs.
export -f worker

# Locate the mapper executable; fail early instead of letting --transferfile
# swallow the next option as its argument.
mapper=$(command -v sac_mapper.py) || {
  echo "run_sac.sh: sac_mapper.py not found on PATH" >&2
  exit 1
}

# Pipeline overview:
#   parallel - runs  worker <line of input_paths> "$@"  on the hosts listed in
#              $hosts, $cores jobs per host, in work dir $wd, retrying a
#              failed job up to 3 times.
#              --transferfile is given DIR/./sac_mapper.py: GNU parallel
#              treats the part after "/./" as relative to the work dir, which
#              is where worker runs ./sac_mapper.py. The "/./" is inserted
#              before the file name only, so a directory whose name contains
#              "sac_" cannot misplace it.
#   tee      - appends everything parallel prints to allout.
#   grep -v  - drops lines containing 'Authorized uses only'
#              (presumably a login banner from the remote hosts - confirm).
#   sac_reducer.py - receives the same "$@" as the workers.
# NOTE(review): parallel hands the command to a shell on the remote side, so
# patterns containing whitespace or shell metacharacters may need additional
# quoting - confirm against the GNU parallel QUOTING documentation.
parallel -v \
    --sshloginfile "$hosts" \
    --retries 3 \
    --transferfile "${mapper%/*}/./${mapper##*/}" \
    --will-cite \
    --jobs "$cores" \
    --workdir "$wd" \
    -a input_paths \
    --env worker \
    worker '{}' "$@" | tee -a allout | grep -v 'Authorized uses only' | \
    sac_reducer.py "$@"