Mercurial > hg > cc > cirrus_work
comparison bin/_s2t.sh @ 5:f035d36cec45
tidy up and include uniq -c
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 20 Jul 2022 19:39:41 +0100 |
parents | b4801f5696b2 |
children |
comparison legend (row status): equal | deleted | inserted | replaced
4:f27061e8a9da | 5:f035d36cec45 |
---|---|
5 n=$SLURM_NTASKS | 5 n=$SLURM_NTASKS |
6 c=$SLURM_CPUS_PER_TASK | 6 c=$SLURM_CPUS_PER_TASK |
7 nodename=$SLURMD_NODENAME | 7 nodename=$SLURMD_NODENAME |
8 local=$SLURM_LOCALID | 8 local=$SLURM_LOCALID |
9 node=$SLURM_NODEID | 9 node=$SLURM_NODEID |
10 | 10 task=$SLURM_PROCID |
11 tPerN=$((n / N)) | |
12 | |
13 task=$((local + (node * tPerN))) | |
14 | 11 |
15 threadsPerTask=2 | 12 threadsPerTask=2 |
16 pjobs=$((c / $threadsPerTask)) | 13 pjobs=$((c / $threadsPerTask)) |
17 | 14 |
18 | |
19 cc=$1 | 15 cc=$1 |
20 resdir=$W/hst/results/$1/$2 | 16 resdir=$W/hst/results/$cc/$2 |
21 s1=$3 | 17 s1=$3 |
22 sn=$4 | 18 sn=$4 |
23 | 19 |
24 echo $(date) task $n.$task on $nodename:$N.$node start | 20 echo $(date) task $n.$task on $nodename:$N.$node start |
25 | 21 |
26 mkdir -p $resdir | 22 mkdir -p $resdir |
27 mkdir -p /dev/shm/hst | |
28 | 23 |
29 doit () { echo $(date) start $1 $task $PARALLEL_SEQ; uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | python3 $WSHARED/bin/cdx2tsv.py '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' '(filename,f.split("/",maxsplit=5)[4][0])' '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' languages > /dev/shm/hst/$1.tsv; echo $(date) moving $1 $task $PARALLEL_SEQ ; mv /dev/shm/hst/$1.tsv $resdir; echo $(date) end $1 $task $PARALLEL_SEQ ;} | 24 doit () { |
25 echo $(date) start $1 $task $PARALLEL_SEQ | |
26 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | \ | |
27 python3 $WSHARED/bin/cdx2tsv.py \ | |
28 '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \ | |
29 '(filename,f.split("/",maxsplit=5)[4][0])' \ | |
30 '(key,key.split(",")[0])' \ | |
31 languages | sort -k2,2 | uniq -c | tr -s ' ' '\t' > $TMPDIR/$1.tsv | |
32 echo $(date) moving $1 $task $PARALLEL_SEQ | |
33 mv $TMPDIR/$1.tsv $resdir | |
34 echo $(date) end $1 $task $PARALLEL_SEQ ;} | |
30 | 35 |
31 export -f doit | 36 export -f doit |
32 export cc resdir n task | 37 export cc resdir n task |
33 | 38 |
34 seq $s1 $sn | while read i; do if [ $((i % $n)) -eq $task ]; then printf '%03g\n' $i; fi; done | \ | 39 $W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}' |
35 parallel -j $pjobs doit '{}' | |
36 | 40 |
37 echo $(date) task $n.$task on $nodename:$N.$node end | 41 echo $(date) task $n.$task on $nodename:$N.$node end |
38 | 42 |