Mercurial > hg > cc > cirrus_work
annotate bin/_s2t.sh @ 243:7bef91ca3d51
make into a library, entry point def unpackz(infileName, callback, outfile = None),
moved to python/lib/cc
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 02 Oct 2024 19:54:45 +0100 |
parents | f035d36cec45 |
children |
rev | line source |
---|---|
2
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/bin/bash |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 N=$SLURM_JOB_NUM_NODES |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 n=$SLURM_NTASKS |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 c=$SLURM_CPUS_PER_TASK |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 nodename=$SLURMD_NODENAME |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 local=$SLURM_LOCALID |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 node=$SLURM_NODEID |
5
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
10 task=$SLURM_PROCID |
2
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 threadsPerTask=2 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 pjobs=$(( c / threadsPerTask > 0 ? c / threadsPerTask : 1 )) # parallel job count, clamped to >= 1: c is empty unless --cpus-per-task was given, and "parallel -j 0" means as many jobs as possible |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 cc=$1 |
5
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
16 resdir=$W/hst/results/$cc/$2 |
2
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 s1=$3 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 sn=$4 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 echo $(date) task $n.$task on $nodename:$N.$node start |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 mkdir -p "$resdir" # quoted so a results path containing spaces or glob characters stays a single argument |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 |
5
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
24 doit () { # Process one cdx shard: $1 is the suffix completing cdx-00$1.gz; the result ends up as $1.tsv in $resdir |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
25 echo $(date) start $1 $task $PARALLEL_SEQ # PARALLEL_SEQ is the job sequence number set by GNU parallel |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
26 uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz" | \ |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
27 python3 "$WSHARED/bin/cdx2tsv.py" \ |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
28 '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \ |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
29 '(filename,f.split("/",maxsplit=5)[4][0])' \ |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
30 '(key,key.split(",")[0])' \ |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
31 languages | sort -k2,2 | uniq -c | tr -s ' ' '\t' > "$TMPDIR/$1.tsv" |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
32 echo $(date) moving $1 $task $PARALLEL_SEQ |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
33 mv "$TMPDIR/$1.tsv" "$resdir" # output is written under $TMPDIR first and only then moved into the results directory |
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
34 echo $(date) end $1 $task $PARALLEL_SEQ ;} |
2
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
36 export -f doit |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
37 export cc resdir n task |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
38 |
5
f035d36cec45
tidy up and include uniq -c
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
39 $W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}' |
2
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
40 |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 echo $(date) task $n.$task on $nodename:$N.$node end |
b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
42 |