Mercurial > hg > cc > cirrus_work
annotate bin/_c2t.sh @ 53:0dc144bd027c
made 1 mean 1, still losing after a while
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 07 Jul 2023 19:04:16 +0100 |
parents | 1ce51aacc468 |
children | 94072b090fdd |
rev | line source |
---|---|
11 | 1 #!/bin/bash |
2 # run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz | |
3 # Args: CCmonth resSubdir s1 sn kf fieldSpecs... | |
4 | |
5 N=$SLURM_JOB_NUM_NODES | |
6 n=$SLURM_NTASKS | |
7 c=$SLURM_CPUS_PER_TASK | |
8 nodename=$SLURMD_NODENAME | |
9 local=$SLURM_LOCALID | |
10 node=$SLURM_NODEID | |
11 task=$SLURM_PROCID | |
12 | |
13 threadsPerTask=2 | |
14 pjobs=$((c / $threadsPerTask)) | |
15 | |
16 cc=$1 | |
17 shift | |
12
1ce51aacc468
fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
11
diff
changeset
|
18 resdir=$W/hst/results/$cc/$1 |
11 | 19 shift |
12
1ce51aacc468
fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
11
diff
changeset
|
20 s1=$1 |
11 | 21 shift |
12
1ce51aacc468
fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
11
diff
changeset
|
22 sn=$1 |
11 | 23 shift |
12
1ce51aacc468
fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
11
diff
changeset
|
24 kf=$1 # key field for sorting |
11 | 25 shift |
26 | |
27 echo $(date) task $n.$task on $nodename:$N.$node start | |
28 | |
29 mkdir -p $resdir | |
30 | |
31 doit () { | |
32 seg=$1 | |
33 shift | |
34 echo $(date) start $seg $task $PARALLEL_SEQ | |
35 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \ | |
36 python3 $WSHARED/bin/cdx2tsv.py "$@" \ | |
37 | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv | |
38 echo $(date) moving $seg $task $PARALLEL_SEQ | |
39 mv $TMPDIR/$seg.tsv $resdir | |
40 echo $(date) end $seg $task $PARALLEL_SEQ ;} | |
41 | |
42 export -f doit | |
12
1ce51aacc468
fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
11
diff
changeset
|
43 export cc resdir n task kf |
11 | 44 |
12
1ce51aacc468
fix quoting pblm by using parallel ... -q
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
11
diff
changeset
|
45 $W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@" |
11 | 46 |
47 echo $(date) task $n.$task on $nodename:$N.$node end | |
48 |