Mercurial > hg > cc > cirrus_work
diff bin/_s2t.sh @ 5:f035d36cec45
tidy up and include uniq -c
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 20 Jul 2022 19:39:41 +0100 |
parents | b4801f5696b2 |
children |
line wrap: on
line diff
--- a/bin/_s2t.sh	Wed Jul 20 19:38:30 2022 +0100
+++ b/bin/_s2t.sh	Wed Jul 20 19:39:41 2022 +0100
@@ -7,32 +7,36 @@
 nodename=$SLURMD_NODENAME
 local=$SLURM_LOCALID
 node=$SLURM_NODEID
-
-tPerN=$((n / N))
-
-task=$((local + (node * tPerN)))
+task=$SLURM_PROCID
 
 threadsPerTask=2
 pjobs=$((c / $threadsPerTask))
 
-
 cc=$1
-resdir=$W/hst/results/$1/$2
+resdir=$W/hst/results/$cc/$2
 s1=$3
 sn=$4
 
 echo $(date) task $n.$task on $nodename:$N.$node start
 
 mkdir -p $resdir
-mkdir -p /dev/shm/hst
 
-doit () { echo $(date) start $1 $task $PARALLEL_SEQ; uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | python3 $WSHARED/bin/cdx2tsv.py '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' '(filename,f.split("/",maxsplit=5)[4][0])' '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' languages > /dev/shm/hst/$1.tsv; echo $(date) moving $1 $task $PARALLEL_SEQ ; mv /dev/shm/hst/$1.tsv $resdir; echo $(date) end $1 $task $PARALLEL_SEQ ;}
+doit () {
+  echo $(date) start $1 $task $PARALLEL_SEQ
+  uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | \
+    python3 $WSHARED/bin/cdx2tsv.py \
+    '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
+    '(filename,f.split("/",maxsplit=5)[4][0])' \
+    '(key,key.split(",")[0])' \
+    languages | sort -k2,2 | uniq -c | tr -s ' ' '\t' > $TMPDIR/$1.tsv
+  echo $(date) moving $1 $task $PARALLEL_SEQ
+  mv $TMPDIR/$1.tsv $resdir
+  echo $(date) end $1 $task $PARALLEL_SEQ ;}
 
 export -f doit
 export cc resdir n task
 
-seq $s1 $sn | while read i; do if [ $((i % $n)) -eq $task ]; then printf '%03g\n' $i; fi; done | \
-  parallel -j $pjobs doit '{}'
+$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}'
 
 echo $(date) task $n.$task on $nodename:$N.$node end
 