Mercurial > hg > cc > cirrus_work
comparison bin/_c2t.sh @ 214:94072b090fdd
csing-related tweaks
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 15 Feb 2024 22:30:40 +0000 |
parents | 1ce51aacc468 |
children |
comparison
equal
deleted
inserted
replaced
213:443b3a6f0b41 | 214:94072b090fdd |
---|---|
13 threadsPerTask=2 | 13 threadsPerTask=2 |
14 pjobs=$((c / $threadsPerTask)) | 14 pjobs=$((c / $threadsPerTask)) |
15 | 15 |
16 cc=$1 | 16 cc=$1 |
17 shift | 17 shift |
18 resdir=$W/hst/results/$cc/$1 | 18 resdir=$W/results/$cc/$1 |
19 shift | 19 shift |
20 s1=$1 | 20 s1=$1 |
21 shift | 21 shift |
22 sn=$1 | 22 sn=$1 |
23 shift | 23 shift |
24 kf=$1 # key field for sorting | 24 kf=$1 # key field for sorting |
25 shift | 25 shift |
26 | 26 |
27 echo $(date) task $n.$task on $nodename:$N.$node start | 27 echo $(date) task $n.$task on $nodename:$N.$node $resdir start |
28 | 28 |
29 mkdir -p $resdir | 29 mkdir -p $resdir |
30 | 30 |
31 doit () { | 31 doit () { |
32 seg=$1 | 32 seg=$1 |
33 shift | 33 shift |
34 echo $(date) start $seg $task $PARALLEL_SEQ | 34 echo $(date) start $seg $task $PARALLEL_SEQ |
| | 35 if [ -f $resdir/$seg.tsv ] |
| | 36 then |
| | 37 echo $(date) skipping $seg $task $PARALLEL_SEQ "$(ls -l $resdir/$seg.tsv)" |
| | 38 exit 0 |
| | 39 fi |
35 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \ | 40 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \ |
36 python3 $WSHARED/bin/cdx2tsv.py "$@" \ | 41 python3 $WSHARED/bin/cdx2tsv.py "$@" \ |
37 | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv | 42 | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv |
38 echo $(date) moving $seg $task $PARALLEL_SEQ | 43 echo $(date) moving $seg $task $PARALLEL_SEQ to $resdir/$seg.tsv |
39 mv $TMPDIR/$seg.tsv $resdir | 44 mv $TMPDIR/$seg.tsv $resdir |
40 echo $(date) end $seg $task $PARALLEL_SEQ ;} | 45 echo $(date) end $seg $task $PARALLEL_SEQ ;} |
41 | 46 |
42 export -f doit | 47 export -f doit |
43 export cc resdir n task kf | 48 export cc resdir n task kf |
44 | 49 |
45 $W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@" | 50 $W/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@" |
46 | 51 |
47 echo $(date) task $n.$task on $nodename:$N.$node end | 52 echo $(date) task $n.$task on $nodename:$N.$node end |
48 | 53 |