comparison bin/_c2t.sh @ 214:94072b090fdd

csing-related tweaks
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Feb 2024 22:30:40 +0000
parents 1ce51aacc468
children
comparison
equal deleted inserted replaced
213:443b3a6f0b41 214:94072b090fdd
13 threadsPerTask=2 13 threadsPerTask=2
14 pjobs=$((c / $threadsPerTask)) 14 pjobs=$((c / $threadsPerTask))
15 15
16 cc=$1 16 cc=$1
17 shift 17 shift
18 resdir=$W/hst/results/$cc/$1 18 resdir=$W/results/$cc/$1
19 shift 19 shift
20 s1=$1 20 s1=$1
21 shift 21 shift
22 sn=$1 22 sn=$1
23 shift 23 shift
24 kf=$1 # key field for sorting 24 kf=$1 # key field for sorting
25 shift 25 shift
26 26
27 echo $(date) task $n.$task on $nodename:$N.$node start 27 echo $(date) task $n.$task on $nodename:$N.$node $resdir start
28 28
29 mkdir -p $resdir 29 mkdir -p $resdir
30 30
31 doit () { 31 doit () {
32 seg=$1 32 seg=$1
33 shift 33 shift
34 echo $(date) start $seg $task $PARALLEL_SEQ 34 echo $(date) start $seg $task $PARALLEL_SEQ
35 if [ -f $resdir/$seg.tsv ]
36 then
37 echo $(date) skipping $seg $task $PARALLEL_SEQ "$(ls -l $resdir/$seg.tsv)"
38 exit 0
39 fi
35 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \ 40 uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \
36 python3 $WSHARED/bin/cdx2tsv.py "$@" \ 41 python3 $WSHARED/bin/cdx2tsv.py "$@" \
37 | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv 42 | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv
38 echo $(date) moving $seg $task $PARALLEL_SEQ 43 echo $(date) moving $seg $task $PARALLEL_SEQ to $resdir/$seg.tsv
39 mv $TMPDIR/$seg.tsv $resdir 44 mv $TMPDIR/$seg.tsv $resdir
40 echo $(date) end $seg $task $PARALLEL_SEQ ;} 45 echo $(date) end $seg $task $PARALLEL_SEQ ;}
41 46
42 export -f doit 47 export -f doit
43 export cc resdir n task kf 48 export cc resdir n task kf
44 49
45 $W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@" 50 $W/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@"
46 51
47 echo $(date) task $n.$task on $nodename:$N.$node end 52 echo $(date) task $n.$task on $nodename:$N.$node end
48 53