changeset 214:94072b090fdd

csing-related tweaks
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Feb 2024 22:30:40 +0000
parents 443b3a6f0b41
children d2c4fec1ed21
files bin/_c2t.sh
diffstat 1 files changed, 9 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/bin/_c2t.sh	Thu Feb 15 16:36:00 2024 +0000
+++ b/bin/_c2t.sh	Thu Feb 15 22:30:40 2024 +0000
@@ -15,7 +15,7 @@
 
 cc=$1
 shift
-resdir=$W/hst/results/$cc/$1
+resdir=$W/results/$cc/$1
 shift
 s1=$1
 shift
@@ -24,7 +24,7 @@
 kf=$1 # key field for sorting
 shift
 
-echo $(date) task $n.$task on $nodename:$N.$node start
+echo $(date) task $n.$task on $nodename:$N.$node $resdir start
 
 mkdir -p $resdir
 
@@ -32,17 +32,22 @@
  seg=$1
  shift
  echo $(date) start $seg $task $PARALLEL_SEQ
+ if [ -f $resdir/$seg.tsv ]
+ then
+  echo $(date) skipping $seg $task $PARALLEL_SEQ "$(ls -l $resdir/$seg.tsv)"
+  exit 0
+ fi
  uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \
    python3 $WSHARED/bin/cdx2tsv.py "$@" \
       | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv
- echo $(date) moving $seg $task $PARALLEL_SEQ 
+ echo $(date) moving $seg $task $PARALLEL_SEQ to $resdir/$seg.tsv
  mv $TMPDIR/$seg.tsv $resdir
   echo $(date) end $seg $task $PARALLEL_SEQ ;}
 
 export -f doit
 export cc resdir n task kf
 
-$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@"
+$W/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs -q doit '{}' "$@"
 
 echo $(date) task $n.$task on $nodename:$N.$node end