Mercurial > hg > cc > cirrus_work
changeset 214:94072b090fdd
csing-related tweaks
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 15 Feb 2024 22:30:40 +0000 |
parents | 443b3a6f0b41 |
children | d2c4fec1ed21 |
files | bin/_c2t.sh |
diffstat | 1 files changed, 9 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/_c2t.sh Thu Feb 15 16:36:00 2024 +0000
+++ b/bin/_c2t.sh Thu Feb 15 22:30:40 2024 +0000
@@ -15,7 +15,7 @@
 cc=$1
 shift
 
-resdir=$W/hst/results/$cc/$1
+resdir=$W/results/$cc/$1
 shift
 s1=$1
 shift
@@ -24,7 +24,7 @@
 kf=$1 # key field for sorting
 shift
 
-echo $(date) task $n.$task on $nodename:$N.$node start
+echo $(date) task $n.$task on $nodename:$N.$node $resdir start
 
 mkdir -p $resdir
 
@@ -32,17 +32,22 @@
   seg=$1
   shift
   echo $(date) start $seg $task $PARALLEL_SEQ
+  if [ -f $resdir/$seg.tsv ]
+  then
+    echo $(date) skipping $seg $task $PARALLEL_SEQ "$(ls -l $resdir/$seg.tsv)"
+    exit 0
+  fi
   uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \
   python3 $WSHARED/bin/cdx2tsv.py "$@" \
    | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv
-  echo $(date) moving $seg $task $PARALLEL_SEQ
+  echo $(date) moving $seg $task $PARALLEL_SEQ to $resdir/$seg.tsv
   mv $TMPDIR/$seg.tsv $resdir
   echo $(date) end $seg $task $PARALLEL_SEQ
 ;}
 
 export -f doit
 export cc resdir n task kf
 
-$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@"
+$W/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@"
 
 echo $(date) task $n.$task on $nodename:$N.$node end
```