comparison bin/_s2t.sh @ 5:f035d36cec45

tidy up and include uniq -c
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 20 Jul 2022 19:39:41 +0100
parents b4801f5696b2
children
comparison
legend: equal | deleted | inserted | replaced
4:f27061e8a9da 5:f035d36cec45
5 n=$SLURM_NTASKS 5 n=$SLURM_NTASKS
6 c=$SLURM_CPUS_PER_TASK 6 c=$SLURM_CPUS_PER_TASK
7 nodename=$SLURMD_NODENAME 7 nodename=$SLURMD_NODENAME
8 local=$SLURM_LOCALID 8 local=$SLURM_LOCALID
9 node=$SLURM_NODEID 9 node=$SLURM_NODEID
10 10 task=$SLURM_PROCID
11 tPerN=$((n / N))
12
13 task=$((local + (node * tPerN)))
14 11
15 threadsPerTask=2 12 threadsPerTask=2
16 pjobs=$((c / $threadsPerTask)) 13 pjobs=$((c / $threadsPerTask))
17 14
18
19 cc=$1 15 cc=$1
20 resdir=$W/hst/results/$1/$2 16 resdir=$W/hst/results/$cc/$2
21 s1=$3 17 s1=$3
22 sn=$4 18 sn=$4
23 19
24 echo $(date) task $n.$task on $nodename:$N.$node start 20 echo $(date) task $n.$task on $nodename:$N.$node start
25 21
26 mkdir -p $resdir 22 mkdir -p $resdir
27 mkdir -p /dev/shm/hst
28 23
29 doit () { echo $(date) start $1 $task $PARALLEL_SEQ; uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | python3 $WSHARED/bin/cdx2tsv.py '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' '(filename,f.split("/",maxsplit=5)[4][0])' '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' languages > /dev/shm/hst/$1.tsv; echo $(date) moving $1 $task $PARALLEL_SEQ ; mv /dev/shm/hst/$1.tsv $resdir; echo $(date) end $1 $task $PARALLEL_SEQ ;} 24 doit () {
25 echo $(date) start $1 $task $PARALLEL_SEQ
26 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | \
27 python3 $WSHARED/bin/cdx2tsv.py \
28 '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
29 '(filename,f.split("/",maxsplit=5)[4][0])' \
30 '(key,key.split(",")[0])' \
31 languages | sort -k2,2 | uniq -c | tr -s ' ' '\t' > $TMPDIR/$1.tsv
32 echo $(date) moving $1 $task $PARALLEL_SEQ
33 mv $TMPDIR/$1.tsv $resdir
34 echo $(date) end $1 $task $PARALLEL_SEQ ;}
30 35
31 export -f doit 36 export -f doit
32 export cc resdir n task 37 export cc resdir n task
33 38
34 seq $s1 $sn | while read i; do if [ $((i % $n)) -eq $task ]; then printf '%03g\n' $i; fi; done | \ 39 $W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}'
35 parallel -j $pjobs doit '{}'
36 40
37 echo $(date) task $n.$task on $nodename:$N.$node end 41 echo $(date) task $n.$task on $nodename:$N.$node end
38 42