changeset 52:9cd9daf75183

working towards more robust diff checking
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 13 Apr 2020 14:12:12 +0100
parents 427056f1784e
children 396d1f17c671
files bin/nogood.py bin/preExtract.sh
diffstat 2 files changed, 24 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/nogood.py	Mon Apr 13 14:12:12 2020 +0100
@@ -0,0 +1,20 @@
+#!/lustre/sw/miniconda3/bin/python3
+'''Detect bad diffs between warc.sh log file and existing extract_...tar
+Usage: nogood.py segid numChanged   (reads 'lineno:diffcommand' lines on stdin)
+Note the slightly counter-intuitive exit status: 0 if the input is no good, 1 if OK'''
+import sys,re
+# The only acceptable diff command is an add with a line range, i.e. NaM,K
+LONG_ADD=re.compile('[1-9][0-9]*a[1-9][0-9]*,[1-9][0-9]*')
+(segment,n)=map(int,sys.argv[1:])
+c=0 # number of acceptable add commands seen so far
+for line in sys.stdin:
+    # partition never raises: with no ':' d is '' and the line is rejected below
+    (i,_,d)=line.rstrip().partition(':')
+    if not LONG_ADD.fullmatch(d):
+        print("non-simple diff at %s: %s for segment %s"%(i,d,segment),file=sys.stderr)
+        sys.exit(0)
+    c+=1
+if c!=n: # the count may be too high or too low, either is no good
+    print("wrong number of diffs (%s<>%s) for segment %s"%(c,n,segment),file=sys.stderr)
+    sys.exit(0)
+sys.exit(1)
--- a/bin/preExtract.sh	Sat Apr 11 13:41:46 2020 +0100
+++ b/bin/preExtract.sh	Mon Apr 13 14:12:12 2020 +0100
@@ -84,18 +84,10 @@
 	  ni=${#ii[@]}
 	  if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
 	  then
-              if [[ "$(tr -s "\n\t " " " < ${e}_check.txt)" =~ \
-           ^" "[0-9]*" > 1 < 1 --- 1 "[0-9]*c[0-9]*" 1 "[0-9]*"a"[0-9,]*" "$ ]]
-              then
-               :
-              else
-		echo " " "extra lines in ${e}_check.txt" >> log
-		continue
-              fi
-	  elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
-	  then
-	      echo " " "non-addition lines in ${e}_check.txt" >> log
-	      continue
+            if egrep -n '^[1-9]' ${e}_diff.txt | $HOME/bin/nogood.py $e $ni
+            then
+              continue
+            fi
 	  fi
           echo " " starting tar update >> log
 	  egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt