#!/bin/bash
# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
# Args: CCmonth resSubdir s0 sn kf fieldSpecs...

# SLURM-provided job layout (set by srun/sbatch for each task).
N=$SLURM_JOB_NUM_NODES        # number of nodes in the job
n=$SLURM_NTASKS               # total number of tasks
c=$SLURM_CPUS_PER_TASK        # CPUs allotted to this task
nodename=$SLURMD_NODENAME     # name of the node running this task
local=$SLURM_LOCALID          # task id local to this node (unused below)
node=$SLURM_NODEID            # node index within the job
task=$SLURM_PROCID            # global task id

# Each uz|cdx2tsv|sort pipeline keeps about two cores busy, so run c/2
# parallel jobs per task.
threadsPerTask=2
pjobs=$((c / threadsPerTask))

# Consume the positional arguments documented in the header.
# BUG FIX: after each `shift` the next argument is $1 again; the original
# read $2/$3/$4/$5 after shifting, which actually picked up the original
# $3, $5, $7 and $9 and never matched the documented interface.
cc=$1        # Common Crawl month, e.g. CC-MAIN-2019-35
shift
resdir=$W/hst/results/$cc/$1   # results subdirectory
shift
s1=$1        # first segment number (s0 in the header)
shift
sn=$1        # last segment number
shift
kf=$1        # key field for sorting
shift
# The remaining arguments ("$@") are fieldSpecs, passed through to cdx2tsv.py.

echo "$(date)" task "$n.$task" on "$nodename:$N.$node" start

mkdir -p "$resdir"
# Process one cdx segment: decompress, convert to TSV with cdx2tsv.py,
# sort on the key field, count duplicate lines with uniq -c, and move the
# finished file into $resdir.
# Runs in a child shell under GNU parallel, so it relies on exported
# variables ($cc, $resdir, $kf) plus $TMPDIR/$WSHARED from the environment.
# Args: $1 = 3-digit segment number; remaining args = fieldSpecs for cdx2tsv.py
doit () {
    local seg=$1    # was a global in the original; keep it function-local
    shift
    echo "$(date)" start "$seg" "$task" "$PARALLEL_SEQ"
    # Build the output in $TMPDIR (node-local scratch) and only mv it into
    # the shared results directory when complete, so readers never see a
    # partially-written file.
    uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" \
        | python3 "$WSHARED/bin/cdx2tsv.py" "$@" \
        | sort -k"$kf","$kf" | uniq -c | tr -s ' ' '\t' > "$TMPDIR/$seg.tsv"
    echo "$(date)" moving "$seg" "$task" "$PARALLEL_SEQ"
    mv "$TMPDIR/$seg.tsv" "$resdir"
    echo "$(date)" end "$seg" "$task" "$PARALLEL_SEQ"
}
export -f doit
# BUG FIX: kf was missing from the export list.  GNU parallel runs doit in
# a child bash, so every variable doit reads must be exported; without kf
# the workers would run `sort -k,` with an empty key field.
export cc resdir n task kf

# share_by_task.sh prints this task's share of the segment numbers s1..sn
# as 3-digit strings (%03g), one per line; parallel fans them out over
# $pjobs local jobs, appending the fieldSpecs ("$@") to each doit call.
"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" \
    | parallel -j "$pjobs" doit '{}' "$@"

echo "$(date)" task "$n.$task" on "$nodename:$N.$node" end