annotate bin/_s2t.sh @ 243:7bef91ca3d51

make into a library, entry point def unpackz(infileName, callback, outfile = None), moved to python/lib/cc
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 02 Oct 2024 19:54:45 +0100
parents f035d36cec45
children
Ignore whitespace changes — everywhere; within whitespace; at end of lines
rev   line source
2
b4801f5696b2 compute node workers, see cirrus_home/bin repo for login node masters
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/bin/bash
# Compute-node worker: run cdx2tsv.py in parallel, taking input directly
# from cdx-00{000..299}.gz
#
# Arguments:
#   $1      crawl identifier; selects /beegfs/common_crawl/$1/cdx/warc as
#           the input directory and is part of the results path
#   $2      results subdirectory, created under $W/hst/results/$1
#   $3, $4  handed unchanged to share_by_task.sh after its -s option
#
# Environment read by this script: W, WSHARED, TMPDIR and the SLURM_*
# variables listed here.

# Slurm job geometry and this task's place in it (meanings per the Slurm
# documentation for srun's output environment variables).
N=$SLURM_JOB_NUM_NODES      # nodes allocated to the job
n=$SLURM_NTASKS             # total number of tasks
c=$SLURM_CPUS_PER_TASK      # CPUs per task; unset unless requested
nodename=$SLURMD_NODENAME   # name of the node running this task
local=$SLURM_LOCALID        # node-local task id (not used further here)
node=$SLURM_NODEID          # relative id of this node
task=$SLURM_PROCID          # rank of this task

threadsPerTask=2

# Number of concurrent jobs given to "parallel -j".  An unset or too small
# SLURM_CPUS_PER_TASK makes the division yield 0, and "-j 0" tells GNU
# parallel to run as many jobs as possible, so clamp to at least one job.
pjobs=$((c / threadsPerTask))
if ((pjobs < 1)); then
  pjobs=1
fi

cc=$1
resdir=$W/hst/results/$cc/$2
s1=$3
sn=$4

# $(date) is deliberately left unquoted so the log line format (single
# spaces between date fields) stays exactly as before.
echo $(date) task $n.$task on $nodename:$N.$node start

# Quoted so a results path containing whitespace is created as one directory.
mkdir -p "$resdir"

# doit SEG
#
# Process one CDX index segment.  SEG is the three-digit suffix that
# completes the file name cdx-00SEG.gz.
#
# The segment is read with uz (an external command not defined in this
# script -- presumably a decompressor, TODO confirm), piped through
# cdx2tsv.py with three field expressions plus "languages", sorted on the
# second field, collapsed with "uniq -c", and has runs of spaces turned
# into single tabs.  The result is written to $TMPDIR/SEG.tsv and only then
# moved into $resdir.
#
# Reads the exported variables cc, resdir and task, plus WSHARED, TMPDIR
# and PARALLEL_SEQ from the environment.
#
# Returns 0 on success.  Returns 1 if any pipeline stage or the final move
# fails; in the pipeline case the partial output is deleted instead of
# being moved into the results directory.
doit () {
  local seg=$1
  local out="$TMPDIR/$seg.tsv"

  # $(date) is left unquoted on purpose to keep the log format unchanged.
  echo $(date) start $seg $task $PARALLEL_SEQ

  # pipefail is switched on inside a subshell so that a failure in any
  # stage (not just the final tr) is reported, without changing shell
  # options for the caller.
  if ! (
    set -o pipefail
    uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
      python3 "$WSHARED/bin/cdx2tsv.py" \
        '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
        '(filename,f.split("/",maxsplit=5)[4][0])' \
        '(key,key.split(",")[0])' \
        languages |
      sort -k2,2 | uniq -c | tr -s ' ' '\t' > "$out"
  ); then
    echo $(date) FAILED $seg $task $PARALLEL_SEQ >&2
    # Never publish a truncated table as if it were a finished result.
    rm -f "$out"
    return 1
  fi

  echo $(date) moving $seg $task $PARALLEL_SEQ
  mv "$out" "$resdir" || return 1
  echo $(date) end $seg $task $PARALLEL_SEQ
}

# Make the variables read by doit, and doit itself, available to the
# shells that parallel starts for each job.
declare -x cc resdir n task
declare -fx doit

# share_by_task.sh is given a "%03g\n" format plus s1, sn, the task count
# and this task's rank; presumably it prints this task's share of segment
# numbers, one per line -- verify against that script.  Each output line
# becomes the single argument of one doit job, at most $pjobs at a time.
$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task |
  parallel -j $pjobs doit '{}'

echo $(date) task $n.$task on $nodename:$N.$node end
