annotate bin/_s2t.sh @ 2:b4801f5696b2

compute node workers, see cirrus_home/bin repo for login node masters
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 19:22:42 +0100
parents
children f035d36cec45
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/bin/bash
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Capture the job geometry that SLURM passes to each task through the
# environment.
#
# N and n feed the task-numbering arithmetic and the segment filter further
# down, so they are mandatory: ${VAR:?msg} prints the message and exits when
# the variable is unset or empty, instead of letting the script stumble on
# with arithmetic errors.
N=${SLURM_JOB_NUM_NODES:?not set - this script must be run as a SLURM task}   # nodes in the job
n=${SLURM_NTASKS:?not set - this script must be run as a SLURM task}          # tasks in the job
c=$SLURM_CPUS_PER_TASK      # CPUs per task; used to size the GNU parallel job count
nodename=$SLURMD_NODENAME   # only used in log messages
local=$SLURM_LOCALID        # index of this task on its node
node=$SLURM_NODEID          # index of this node within the job
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Tasks per node (integer division, so this assumes the tasks are spread
# evenly over the nodes).
tPerN=$((n / N))

# Job-wide index of this task, built from the node index and the index of
# the task on that node.
task=$((local + (node * tPerN)))

# Number of concurrent doit pipelines: CPUs available to this task divided
# by the CPUs budgeted per pipeline.
threadsPerTask=2
pjobs=$((c / threadsPerTask))

# With fewer than $threadsPerTask CPUs the division yields 0, and GNU
# parallel reads '-j 0' as "run as many jobs as possible" - the opposite of
# what a small allocation wants.  Always run at least one job at a time.
if [ "$pjobs" -lt 1 ]; then
    pjobs=1
fi
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Positional arguments:
#   $1  name of the directory under /beegfs/common_crawl to read cdx files
#       from; also the first component of the results path
#   $2  sub-directory of $W/hst/results/$1 that receives the .tsv files
#   $3  first segment number handed to seq
#   $4  last segment number handed to seq
#
# All four are required: without $4, seq would silently count from 1 up to
# $3, and without $1 the input and results paths are malformed.
if [ $# -lt 4 ]; then
    echo "usage: $0 <cc> <results-subdir> <first-segment> <last-segment>" >&2
    exit 2
fi

cc=$1
resdir=$W/hst/results/$1/$2
s1=$3
sn=$4
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Log which task is starting and where.
echo $(date) task $n.$task on $nodename:$N.$node start

# Create the results directory and the staging directory under /dev/shm
# that doit writes to before moving each finished file into $resdir.
# $resdir is quoted so a path containing whitespace stays one argument, and
# the script stops here if either directory cannot be created - otherwise
# every later write or move would fail one by one.
mkdir -p "$resdir" /dev/shm/hst || exit 1
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# doit SEG
#
# Convert one cdx segment to a .tsv file.
#
#   SEG  three-digit segment number; the input read is
#        /beegfs/common_crawl/$cc/cdx/warc/cdx-00SEG.gz
#
# The segment is decompressed with uz and piped through cdx2tsv.py, whose
# output is staged in /dev/shm/hst/SEG.tsv and then moved into $resdir.
# Reads cc, resdir and task from the environment (they are exported by the
# caller) plus PARALLEL_SEQ, which GNU parallel sets for each job.
#
# Returns non-zero if the conversion pipeline or the move fails.  A failed
# conversion removes the partial staging file instead of moving truncated
# output into the results directory.
doit () {
    local seg=$1
    local staged=/dev/shm/hst/$seg.tsv

    echo $(date) start $seg $task $PARALLEL_SEQ

    # Run the pipeline in a subshell so pipefail stays local to it; with
    # pipefail a failure of uz is noticed as well as one of python3.
    if ! ( set -o pipefail
           uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
               python3 "$WSHARED/bin/cdx2tsv.py" \
                   '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
                   '(filename,f.split("/",maxsplit=5)[4][0])' \
                   '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' \
                   languages > "$staged" ); then
        echo $(date) failed $seg $task $PARALLEL_SEQ >&2
        rm -f "$staged"
        return 1
    fi

    echo $(date) moving $seg $task $PARALLEL_SEQ
    # On a failed move the staged file is left in place so no data is lost.
    mv "$staged" "$resdir" || return 1

    echo $(date) end $seg $task $PARALLEL_SEQ
}
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Put the worker function and the variables listed here into the
# environment, so the processes GNU parallel starts can see them.
declare -fx doit
declare -x cc resdir n task
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Work distribution: of the segment numbers s1..sn, this task takes those
# whose remainder modulo the task count equals its own task index.  Each
# selected number is printed zero-padded to three digits and handed to doit
# by GNU parallel, at most $pjobs at a time.
seq $s1 $sn |
    while read seg
    do
        if [ $((seg % $n)) -eq $task ]
        then
            printf '%03g\n' $seg
        fi
    done |
    parallel -j $pjobs doit '{}'

# Log that this task has finished.
echo $(date) task $n.$task on $nodename:$N.$node end
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
38