# HG changeset patch
# User Henry S. Thompson
# Date 1586783532 -3600
# Node ID 9cd9daf7518303dc94db804768759d97efc23178
# Parent  427056f1784eb7739039b5748acdd6173a4fb6b5
working towards more robust diff checking

diff -r 427056f1784e -r 9cd9daf75183 bin/nogood.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/nogood.py	Mon Apr 13 14:12:12 2020 +0100
@@ -0,0 +1,20 @@
+#!/lustre/sw/miniconda3/bin/python3
+'''Detect bad diffs between warc.sh log file and existing extract_...tar
+Usage: nogood.py segid numChanged
+Note the slightly counter-intuitive value: 0 if the input is no good'''
+import sys,re
+LONG_ADD=re.compile('[1-9][0-9]*a[1-9][0-9]*,[1-9][0-9]*')
+(segment,n)=map(int,sys.argv[1:])
+c=0
+for l in sys.stdin:
+  (i,d)=l.rstrip().split(':')
+  if LONG_ADD.fullmatch(d):
+    c+=1
+    continue
+  else:
+    print("non-simple diff at %s: %s for segment %s"%(i,d,segment),file=sys.stderr)
+    exit(0)
+if c!=n:
+  print("too many diffs? (%s<>%s) for segment %s"%(c,n,segment),file=sys.stderr)
+  exit(0)
+exit(1)
diff -r 427056f1784e -r 9cd9daf75183 bin/preExtract.sh
--- a/bin/preExtract.sh	Sat Apr 11 13:41:46 2020 +0100
+++ b/bin/preExtract.sh	Mon Apr 13 14:12:12 2020 +0100
@@ -84,18 +84,10 @@
  ni=${#ii[@]}
  if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
  then
-  if [[ "$(tr -s "\n\t " " " < ${e}_check.txt)" =~ \
-   ^" "[0-9]*" > 1 < 1 --- 1 "[0-9]*c[0-9]*" 1 "[0-9]*"a"[0-9,]*" "$ ]]
-  then
-   :
-  else
-   echo " " "extra lines in ${e}_check.txt" >> log
-   continue
-  fi
- elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
- then
-  echo " " "non-addition lines in ${e}_check.txt" >> log
-  continue
+  if egrep -n '^[1-9]' ${e}_diff.txt | $HOME/bin/nogood.py $e $ni
+  then
+   continue
+  fi
  fi
  echo " " starting tar update >> log
  egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt