view bin/_c2t.sh @ 208:b1190db19d78

sic
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 07 Dec 2023 18:23:11 +0000
parents 1ce51aacc468
children 94072b090fdd
line wrap: on
line source

#!/bin/bash
# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
# Args: CCmonth resSubdir s1 sn kf fieldSpecs...

# Job-step layout as reported by Slurm through the environment.
N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID
node=$SLURM_NODEID
task=$SLURM_PROCID

# Number of concurrent doit jobs per task: CPUs of this task divided by
# the threads each job is budgeted.
threadsPerTask=2
pjobs=$((c / threadsPerTask))
# An unset/empty SLURM_CPUS_PER_TASK (or a value below threadsPerTask)
# makes the division yield 0, and GNU parallel treats "-j 0" as "run as
# many jobs as possible".  Never go below one job slot.
if (( pjobs < 1 )); then
  pjobs=1
fi

# Positional arguments, consumed in order; whatever is left in "$@"
# afterwards is handed on unchanged to cdx2tsv.py.
cc=$1
shift
resdir=$W/hst/results/$cc/$1
shift
s1=$1
shift
sn=$1
shift
kf=$1 # key field for sorting
shift

echo $(date) task $n.$task on $nodename:$N.$node start

mkdir -p -- "$resdir"

# Process one CDX segment and publish its tally as $resdir/<seg>.tsv.
#
# Arguments: $1   - segment suffix, appended to "cdx-00" to form the file name
#            $2.. - passed through unchanged to cdx2tsv.py
# Globals:   cc, kf, task, resdir, TMPDIR, WSHARED, PARALLEL_SEQ (all read)
# Returns:   0 when the result file was moved into $resdir;
#            1 when any pipeline stage or the move failed.  In that case
#            nothing is placed in $resdir, so a truncated result can never
#            be mistaken for a complete one.
# Note:      uz is an external helper not visible here; presumably it
#            decompresses its argument to stdout -- confirm.
doit () {
 local seg=$1
 shift
 local out=$TMPDIR/$seg.tsv
 local -a stages
 local rc
 # Log lines are deliberately left unquoted so their spacing stays as before.
 echo $(date) start $seg $task $PARALLEL_SEQ
 uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" | \
   python3 "$WSHARED/bin/cdx2tsv.py" "$@" \
      | sort -k"$kf","$kf" | uniq -c | tr -s ' ' '\t' > "$out"
 # The pipeline's own status is only that of tr; inspect every stage so a
 # failed decompress/convert/sort is not silently published.
 stages=("${PIPESTATUS[@]}")
 for rc in "${stages[@]}"; do
  if (( rc != 0 )); then
   echo $(date) failed $seg $task $PARALLEL_SEQ "(pipeline status: ${stages[*]})" >&2
   rm -f -- "$out"
   return 1
  fi
 done
 echo $(date) moving $seg $task $PARALLEL_SEQ
 if ! mv -- "$out" "$resdir"; then
  echo $(date) failed move $seg $task $PARALLEL_SEQ >&2
  return 1
 fi
 echo $(date) end $seg $task $PARALLEL_SEQ ;}

# GNU parallel runs each job in a fresh shell, so the worker function and
# the variables it reads have to be exported to be visible there.
export -f doit
export cc resdir n task kf

# share_by_task.sh is an external helper; presumably it prints, one per
# line in "%03g" format, the segment numbers in s1..sn that belong to
# task $task out of $n -- confirm against that script.  Each line becomes
# the first argument of a doit call, followed by the remaining
# command-line arguments.  Expansions are quoted so paths or values
# containing whitespace or glob characters are passed through intact.
"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" \
  | parallel -j "$pjobs" -q doit '{}' "$@"

echo $(date) task $n.$task on $nodename:$N.$node end