changeset 5:f035d36cec45

tidy up and include uniq -c
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 20 Jul 2022 19:39:41 +0100
parents f27061e8a9da
children f60c03e86e40
files bin/_s2t.sh
diffstat 1 files changed, 14 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/bin/_s2t.sh	Wed Jul 20 19:38:30 2022 +0100
+++ b/bin/_s2t.sh	Wed Jul 20 19:39:41 2022 +0100
@@ -7,32 +7,36 @@
 nodename=$SLURMD_NODENAME
 local=$SLURM_LOCALID
 node=$SLURM_NODEID
-
-tPerN=$((n / N))
-
-task=$((local + (node * tPerN)))
+task=$SLURM_PROCID
 
 threadsPerTask=2
 pjobs=$((c / $threadsPerTask))
 
-
 cc=$1
-resdir=$W/hst/results/$1/$2
+resdir=$W/hst/results/$cc/$2
 s1=$3
 sn=$4
 
 echo $(date) task $n.$task on $nodename:$N.$node start
 
 mkdir -p $resdir
-mkdir -p /dev/shm/hst
 
-doit () { echo $(date) start $1 $task $PARALLEL_SEQ; uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | python3 $WSHARED/bin/cdx2tsv.py '(filename,f.split("/",maxsplit=5)[3].split(".")[1])'  '(filename,f.split("/",maxsplit=5)[4][0])' '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' languages > /dev/shm/hst/$1.tsv; echo $(date) moving $1 $task $PARALLEL_SEQ ; mv /dev/shm/hst/$1.tsv $resdir;  echo $(date) end $1 $task $PARALLEL_SEQ ;}
+doit () {
+ echo $(date) start $1 $task $PARALLEL_SEQ
+ uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | \
+   python3 $WSHARED/bin/cdx2tsv.py \
+     '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
+     '(filename,f.split("/",maxsplit=5)[4][0])' \
+     '(key,key.split(",")[0])' \
+     languages | sort -k2,2 | uniq -c | tr -s ' ' '\t' > $TMPDIR/$1.tsv
+ echo $(date) moving $1 $task $PARALLEL_SEQ 
+ mv $TMPDIR/$1.tsv $resdir
+  echo $(date) end $1 $task $PARALLEL_SEQ ;}
 
 export -f doit
 export cc resdir n task
 
-seq $s1 $sn | while read i; do if [ $((i % $n)) -eq $task ]; then printf '%03g\n' $i; fi; done | \
-   parallel -j $pjobs doit '{}'
+$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}'
 
 echo $(date) task $n.$task on $nodename:$N.$node end