Mercurial > hg > cc > cirrus_work
diff bin/_c2t.sh @ 11:dfdb95e5d774
catch-up
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 28 Jul 2022 14:45:35 +0100 |
parents | |
children | 1ce51aacc468 |
line wrap: on
line diff
#!/bin/bash
# Run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
# Args: CCmonth resSubdir s0 sn kf fieldSpecs...
#   CCmonth     selects /beegfs/common_crawl/<CCmonth>/cdx/warc as the input dir
#   resSubdir   subdirectory of $W/hst/results/<CCmonth> that receives the output
#   s0 sn       handed to share_by_task.sh as "-s s0 sn"
#               (presumably the first and last segment numbers -- confirm there)
#   kf          key field for sorting
#   fieldSpecs  all remaining arguments, passed unchanged to cdx2tsv.py
# Environment read: W, WSHARED, TMPDIR and the SLURM_* variables below.
# NOTE(review): doit runs in shells started by parallel, so WSHARED and TMPDIR
# are assumed to already be exported by the caller -- confirm.

if (( $# < 5 )); then
  echo "usage: ${0##*/} CCmonth resSubdir s0 sn kf fieldSpecs..." >&2
  exit 2
fi

N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID   # not used below
node=$SLURM_NODEID
task=$SLURM_PROCID

threadsPerTask=2
# Number of concurrent doit jobs: CPUs for this task divided by threads per job
pjobs=$((c / threadsPerTask))

# Read the five fixed arguments by position first and shift once afterwards.
# Interleaving 'shift' with increasing indices ($1, shift, $2, shift, $3, ...)
# would pick up arguments 1, 3, 5, 7 and 9 instead of 1 through 5.
cc=$1
resdir=$W/hst/results/$cc/$2
s1=$3
sn=$4
kf=$5 # key field for sorting
shift 5
# "$@" now holds only fieldSpecs...

echo $(date) task $n.$task on $nodename:$N.$node start

mkdir -p "$resdir"

# Process one cdx segment.
# Arguments: $1 - segment suffix as emitted by share_by_task.sh (format %03g);
#            remaining arguments are passed to cdx2tsv.py
# Reads from the environment: cc, kf, resdir, task, TMPDIR, WSHARED, PARALLEL_SEQ
# The table is written under $TMPDIR first and only then moved into $resdir.
doit () {
  local seg=$1
  shift
  echo $(date) start $seg $task $PARALLEL_SEQ
  # uz is an external command not defined here (presumably a decompressor).
  # After sorting on field $kf, uniq -c prefixes each distinct line with its
  # count and tr squeezes each run of spaces into a single tab.
  uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" \
    | python3 "$WSHARED/bin/cdx2tsv.py" "$@" \
    | sort -k"$kf,$kf" | uniq -c | tr -s ' ' '\t' > "$TMPDIR/$seg.tsv"
  echo $(date) moving $seg $task $PARALLEL_SEQ
  mv "$TMPDIR/$seg.tsv" "$resdir"
  echo $(date) end $seg $task $PARALLEL_SEQ
}

# doit is executed by parallel in separate shells, so the function and every
# script variable it reads must be exported; kf is needed for the sort key.
export -f doit
export cc resdir n task kf

"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" \
  | parallel -j "$pjobs" doit '{}' "$@"

echo $(date) task $n.$task on $nodename:$N.$node end