Mercurial > hg > cc > cirrus_work
annotate bin/_s2t.sh @ 2:b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 18 Jul 2022 19:22:42 +0100 |
parents | |
children | f035d36cec45 |
rev | line source |
---|---|
2
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
#!/bin/bash
# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
#
# Usage: _s2t.sh CC SUBDIR FIRST LAST
#   CC           selects /beegfs/common_crawl/CC/cdx/warc as the input directory
#   SUBDIR       results are collected in $W/hst/results/CC/SUBDIR
#   FIRST, LAST  bounds handed to seq to enumerate the cdx segment numbers
# Reads W from the environment, in addition to the SLURM_* variables below.

# Job geometry as exported by SLURM to each task.
N=$SLURM_JOB_NUM_NODES      # number of nodes in the job
n=$SLURM_NTASKS             # number of tasks in the job
c=$SLURM_CPUS_PER_TASK      # CPUs per task; SLURM only sets this when --cpus-per-task is given
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID        # index of this task on its node
node=$SLURM_NODEID          # index of this node within the job

tPerN=$((n / N))

# Job-wide index of this task.
# NOTE(review): assumes each node runs tPerN consecutively numbered tasks --
# confirm against the task distribution the launching srun uses.
task=$((local + (node * tPerN)))

threadsPerTask=2
# Number of conversions this task runs at once.  The quotient is 0 whenever c
# is empty or smaller than threadsPerTask, and GNU parallel interprets "-j 0"
# as "run as many jobs as possible", so never let it drop below 1.
pjobs=$((c / threadsPerTask))
if [ "$pjobs" -lt 1 ]; then
  pjobs=1
fi


cc=$1
resdir=$W/hst/results/$1/$2
s1=$3
sn=$4

# $(date) is deliberately unquoted so the log line keeps single-space separators.
echo $(date) task $n.$task on $nodename:$N.$node start

mkdir -p "$resdir"
mkdir -p /dev/shm/hst

# doit SEG
#   Convert one cdx segment to TSV.  SEG is the zero-padded segment number
#   (the part after "cdx-00" in the input file name).
#   Reads cc, resdir, task and WSHARED from the environment; PARALLEL_SEQ is
#   only echoed into the log lines.
#   The TSV is written to /dev/shm/hst first and moved to $resdir only when
#   the whole uz | cdx2tsv.py pipeline succeeded; otherwise the partial file is
#   removed and a non-zero status is returned, so a truncated result never
#   lands in the results directory.
doit () {
  local seg=$1
  local scratch=/dev/shm/hst/$seg.tsv

  echo $(date) start $seg $task $PARALLEL_SEQ

  # Subshell keeps pipefail from leaking into the caller's shell options.
  if ! ( set -o pipefail
         uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
           python3 "$WSHARED/bin/cdx2tsv.py" \
             '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
             '(filename,f.split("/",maxsplit=5)[4][0])' \
             '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' \
             languages > "$scratch" ); then
    echo $(date) failed $seg $task $PARALLEL_SEQ >&2
    rm -f "$scratch"
    return 1
  fi

  echo $(date) moving $seg $task $PARALLEL_SEQ
  mv "$scratch" "$resdir" || return
  echo $(date) end $seg $task $PARALLEL_SEQ
}

# doit runs in shells spawned by GNU parallel, so the function and the
# variables it reads must be exported.  WSHARED is not exported here; it has
# to be in the environment already.
export -f doit
export cc resdir n task

# Enumerate the segment numbers s1..sn, keep those whose value modulo the
# task count equals this task's index, zero-pad them to three digits and feed
# them to parallel.
# $s1 and $sn stay unquoted so seq receives exactly the arguments it did before.
seq $s1 $sn |
  while read -r i; do
    if [ $((i % n)) -eq "$task" ]; then
      printf '%03g\n' "$i"
    fi
  done |
  parallel -j "$pjobs" doit '{}'
# Status of the last pipeline stage, i.e. parallel (pipefail is not set here).
status=$?

echo $(date) task $n.$task on $nodename:$N.$node end

# Report parallel's status instead of the status of the echo above, so the
# caller can tell when the worker stage failed.
exit $status