annotate bin/_c2t.sh @ 214:94072b090fdd

csing-related tweaks
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Feb 2024 22:30:40 +0000
parents 1ce51aacc468
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/bin/bash
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Args: CCmonth resSubdir s1 sn kf fieldSpecs...
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Slurm-provided job/task placement, used below in log messages and when
# asking share_by_task.sh for this task's share of the work.
N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID
node=$SLURM_NODEID
task=$SLURM_PROCID

# Number of concurrent doit jobs: CPUs for this task divided by the threads
# budgeted per job.
threadsPerTask=2
pjobs=$((c / threadsPerTask))
# An unset or too-small CPU count makes the division yield 0, and GNU
# parallel reads "-j 0" as "run as many jobs as possible"; never go below 1.
if [ "$pjobs" -lt 1 ]
then
  pjobs=1
fi

# Positional arguments, consumed one at a time; whatever is left in "$@"
# afterwards is handed on to cdx2tsv.py.
cc=$1
shift
resdir=$W/results/$cc/$1
shift
s1=$1
shift
sn=$1
shift
kf=$1 # key field for sorting
shift

echo $(date) task $n.$task on $nodename:$N.$node $resdir start

mkdir -p -- "$resdir"
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
doit () {
  # Turn one cdx segment into a sorted, counted TSV file in $resdir.
  #
  # Arguments:
  #   $1       segment suffix; input is .../cdx/warc/cdx-00<suffix>.gz and
  #            output is $resdir/<suffix>.tsv
  #   $2...    passed through unchanged to cdx2tsv.py
  # Globals read: cc resdir kf task TMPDIR WSHARED PARALLEL_SEQ
  # Returns: 0 when the result is in place (freshly built or already there),
  #          non-zero when the pipeline or the final move failed.
  local seg=$1
  shift
  local final=$resdir/$seg.tsv
  local scratch=$TMPDIR/$seg.tsv
  local rc

  echo $(date) start $seg $task $PARALLEL_SEQ
  if [ -f "$final" ]
  then
    # An existing result is taken as complete, so reruns only fill gaps.
    echo $(date) skipping $seg $task $PARALLEL_SEQ "$(ls -l "$final")"
    return 0
  fi

  # Build in $TMPDIR first. pipefail is confined to a subshell so that a
  # failure in any stage is seen here without changing the caller's options.
  ( set -o pipefail
    uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
      python3 "$WSHARED/bin/cdx2tsv.py" "$@" |
      sort -k"$kf,$kf" | uniq -c | tr -s ' ' '\t' > "$scratch" )
  rc=$?
  if [ "$rc" -ne 0 ]
  then
    # Never publish partial output: the skip test above would treat it as
    # a finished segment on every later run.
    echo $(date) failed $seg $task $PARALLEL_SEQ status $rc >&2
    rm -f -- "$scratch"
    return "$rc"
  fi

  echo $(date) moving $seg $task $PARALLEL_SEQ to $resdir/$seg.tsv
  mv -- "$scratch" "$resdir" || return
  echo $(date) end $seg $task $PARALLEL_SEQ
}
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
46
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# parallel starts a fresh shell per job, so the function and every variable
# it reads must be exported to be visible there.
export -f doit
export cc resdir n task kf

# share_by_task.sh's output lines each become the {} argument of one doit
# call (presumably this task's share of segments s1..sn, formatted %03g --
# confirm against share_by_task.sh). -q keeps the remaining field specs
# quoted exactly as they were given to this script.
"$W/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" |
  parallel -j "$pjobs" -q doit '{}' "$@"
rc=$?

echo $(date) task $n.$task on $nodename:$N.$node end

# Report parallel's status rather than echo's, so a task with failed
# segments does not look successful to whatever launched it.
exit "$rc"
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53