# HG changeset patch # User Henry S. Thompson # Date 1708036240 0 # Node ID 94072b090fddba2cfc37bcee8fff5cd2e130b2ec # Parent 443b3a6f0b418ac0d061a042a7730f14a521be7d csing-related tweaks diff -r 443b3a6f0b41 -r 94072b090fdd bin/_c2t.sh --- a/bin/_c2t.sh Thu Feb 15 16:36:00 2024 +0000 +++ b/bin/_c2t.sh Thu Feb 15 22:30:40 2024 +0000 @@ -15,7 +15,7 @@ cc=$1 shift -resdir=$W/hst/results/$cc/$1 +resdir=$W/results/$cc/$1 shift s1=$1 shift @@ -24,7 +24,7 @@ kf=$1 # key field for sorting shift -echo $(date) task $n.$task on $nodename:$N.$node start +echo $(date) task $n.$task on $nodename:$N.$node $resdir start mkdir -p $resdir @@ -32,17 +32,22 @@ seg=$1 shift echo $(date) start $seg $task $PARALLEL_SEQ + if [ -f $resdir/$seg.tsv ] + then + echo $(date) skipping $seg $task $PARALLEL_SEQ "$(ls -l $resdir/$seg.tsv)" + exit 0 + fi uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \ python3 $WSHARED/bin/cdx2tsv.py "$@" \ | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv - echo $(date) moving $seg $task $PARALLEL_SEQ + echo $(date) moving $seg $task $PARALLEL_SEQ to $resdir/$seg.tsv mv $TMPDIR/$seg.tsv $resdir echo $(date) end $seg $task $PARALLEL_SEQ ;} export -f doit export cc resdir n task kf -$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@" +$W/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@" echo $(date) task $n.$task on $nodename:$N.$node end