annotate bin/_c2t.sh @ 53:0dc144bd027c

made 1 mean 1, still losing after a while
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 07 Jul 2023 19:04:16 +0100
parents 1ce51aacc468
children 94072b090fdd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/bin/bash
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Args: CCmonth resSubdir s1 sn kf fieldSpecs...
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 N=$SLURM_JOB_NUM_NODES
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 n=$SLURM_NTASKS
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 c=$SLURM_CPUS_PER_TASK
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 nodename=$SLURMD_NODENAME
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 local=$SLURM_LOCALID
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 node=$SLURM_NODEID
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 task=$SLURM_PROCID
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 threadsPerTask=2
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 pjobs=$((c / $threadsPerTask))
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 cc=$1
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 shift
12
1ce51aacc468 fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 11
diff changeset
18 resdir=$W/hst/results/$cc/$1
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19 shift
12
1ce51aacc468 fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 11
diff changeset
20 s1=$1
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 shift
12
1ce51aacc468 fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 11
diff changeset
22 sn=$1
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23 shift
12
1ce51aacc468 fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 11
diff changeset
24 kf=$1 # key field for sorting
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25 shift
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27 echo $(date) task $n.$task on $nodename:$N.$node start
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 mkdir -p $resdir
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 doit () {
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 seg=$1
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 shift
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 echo $(date) start $seg $task $PARALLEL_SEQ
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36 python3 $WSHARED/bin/cdx2tsv.py "$@" \
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37 | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
38 echo $(date) moving $seg $task $PARALLEL_SEQ
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
39 mv $TMPDIR/$seg.tsv $resdir
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
40 echo $(date) end $seg $task $PARALLEL_SEQ ;}
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
42 export -f doit
12
1ce51aacc468 fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 11
diff changeset
43 export cc resdir n task kf
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
44
12
1ce51aacc468 fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 11
diff changeset
45 $W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@"
11
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
46
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47 echo $(date) task $n.$task on $nodename:$N.$node end
dfdb95e5d774 catch-up
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48