comparison bin/_c2t.sh @ 11:dfdb95e5d774

catch-up
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Jul 2022 14:45:35 +0100
parents
children 1ce51aacc468
comparison
equal deleted inserted replaced
10:cf982df00cbd 11:dfdb95e5d774
#!/bin/bash
# Run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
# Args: CCmonth resSubdir s0 sn kf fieldSpecs...
#   CCmonth       crawl identifier; names the directory under
#                 /beegfs/common_crawl/ and under $W/hst/results/
#   resSubdir     subdirectory of $W/hst/results/CCmonth/ that receives output
#   s0 sn         passed to share_by_task.sh (-s s0 sn), which emits the
#                 segment numbers for this task -- presumably first and last
#                 segment; confirm against share_by_task.sh
#   kf            key field for sorting
#   fieldSpecs... remaining arguments, handed on to cdx2tsv.py unchanged
# Environment: W, plus the SLURM_* variables read below.

# Job geometry as reported by Slurm
N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID
node=$SLURM_NODEID
task=$SLURM_PROCID

threadsPerTask=2
pjobs=$(( c / threadsPerTask ))
# c unset or 1 gives 0 here, and 'parallel -j 0' means "as many jobs as
# possible", so never go below one job.
(( pjobs >= 1 )) || pjobs=1

# parse_args CCmonth resSubdir s0 sn kf
# Sets the globals cc, resdir, s1, sn and kf from the five leading arguments.
# All five are read before anything is shifted: reading $2, $3, ... after
# successive shifts skips an argument each time.
parse_args () {
  cc=$1
  resdir=$W/hst/results/$cc/$2
  s1=$3
  sn=$4
  kf=$5 # key field for sorting
}

parse_args "$@"
# Drop the consumed arguments so "$@" holds only fieldSpecs; never ask shift
# for more than is there, since it would then shift nothing at all.
shift "$(( $# < 5 ? $# : 5 ))"

echo "$(date) task $n.$task on $nodename:$N.$node start"

mkdir -p "$resdir"
30
# doit SEG [fieldSpecs...]
# Process one cdx segment: decompress cdx-00SEG.gz, run it through cdx2tsv.py
# with the remaining arguments, sort on field $kf, count duplicate lines and
# write the tab-separated result to $resdir/SEG.tsv.
# Globals read: cc, kf, task, resdir, WSHARED, TMPDIR, PARALLEL_SEQ
# Returns: 0 on success; non-zero if any pipeline stage or the final mv
#          fails, in which case nothing is moved into $resdir.
doit () {
  local seg=$1
  shift
  # Build the output locally first, then move it into place in one step
  local out=${TMPDIR:-/tmp}/$seg.tsv
  local -a status
  local rc

  echo "$(date) start $seg $task $PARALLEL_SEQ"
  uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" \
    | python3 "$WSHARED/bin/cdx2tsv.py" "$@" \
    | sort -k"$kf,$kf" \
    | uniq -c \
    | tr -s ' ' '\t' > "$out"
  # Must be captured immediately: the next command overwrites PIPESTATUS
  status=("${PIPESTATUS[@]}")
  for rc in "${status[@]}"; do
    if (( rc != 0 )); then
      # Do not publish a truncated result file
      echo "$(date) failed $seg $task $PARALLEL_SEQ (pipeline status: ${status[*]})" >&2
      return 1
    fi
  done

  echo "$(date) moving $seg $task $PARALLEL_SEQ"
  # Trailing slash: fail rather than rename the file if $resdir is missing
  mv -- "$out" "$resdir/" || return
  echo "$(date) end $seg $task $PARALLEL_SEQ"
}
41
# GNU parallel runs each job in a fresh shell, which sees only exported
# functions and variables. doit reads kf (the sort key), so it has to be
# exported along with the rest.
export -f doit
export cc resdir n task kf

# share_by_task.sh writes this task's segment numbers, one per line, formatted
# by the -f pattern; parallel runs doit once per line, appending the
# fieldSpecs left in "$@".
"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" \
  | parallel -j "${pjobs:-1}" doit '{}' "$@"

echo "$(date) task $n.$task on $nodename:$N.$node end"
48