changeset 53:396d1f17c671

ready to try another pass with robust diff checking
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 13 Apr 2020 15:24:32 +0100
parents 9cd9daf75183
children 8154560f1e3d
files bin/nogood.py bin/preExtract.sh
diffstat 2 files changed, 37 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/bin/nogood.py	Mon Apr 13 14:12:12 2020 +0100
+++ b/bin/nogood.py	Mon Apr 13 15:24:32 2020 +0100
@@ -3,7 +3,12 @@
 Usage: nogood.py segid numChanged
 Note the slightly counter-intuitive value: 0 if the input is no good'''
 import sys,re
-LONG_ADD=re.compile('[1-9][0-9]*a[1-9][0-9]*,[1-9][0-9]*')
+shortPat='[1-9][0-9]*%s[1-9][0-9]*'
+SHORT_ADD=re.compile(shortPat%'a')
+SHORT_CHANGE=re.compile(shortPat%'c')
+longPat=shortPat+',[1-9][0-9]*'
+LONG_ADD=re.compile(longPat%'a')
+LONG_CHANGE=re.compile(longPat%'c')
 (segment,n)=map(int,sys.argv[1:])
 c=0
 for l in sys.stdin:
@@ -11,10 +16,35 @@
     if LONG_ADD.fullmatch(d):
         c+=1
         continue
-    else:
-        print("non-simple diff at %s: %s for segment %s"%(i,d,segment),file=sys.stderr)
-        exit(0)
+    elif SHORT_ADD.fullmatch(d):
+        # as in 1566027315865.44/40_diff.txt
+        try:
+            (j,e)=sys.stdin.readline().rstrip().split(':')
+        except ValueError:
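+            # Probably EOF: readline() gives '', so the unpack raises ValueError
+            # Tentatively count this trailing short add as acceptable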
+            c+=1
+            continue
+        if int(j)==int(i)+2 and LONG_CHANGE.fullmatch(e):
+            c+=1
+            continue
+    elif SHORT_CHANGE.fullmatch(d):
+        # as in 1566027315865.44/40_diff.txt
+        try:
+            (j,e)=sys.stdin.readline().rstrip().split(':')
+        except ValueError:
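+            # Probably EOF: readline() gives '', so the unpack raises ValueError
+            # Tentatively count this trailing short change as acceptable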
+            c+=1
+            continue
+        if int(j)==int(i)+4 and LONG_ADD.fullmatch(e):
+            c+=1
+            continue
+    print("can't handle diff at %s: %s for segment %s"%(i,d,segment),
+          file=sys.stderr)
+    exit(0)
 if c!=n:
-    print("too many diffs? (%s<>%s) for segment %s"%(c,n,segment),file=sys.stderr)
+    print("too many diffs? (%s<>%s) for segment %s"%(c,n,segment),
+          file=sys.stderr)
     exit(0)
 exit(1)
--- a/bin/preExtract.sh	Mon Apr 13 14:12:12 2020 +0100
+++ b/bin/preExtract.sh	Mon Apr 13 15:24:32 2020 +0100
@@ -84,7 +84,8 @@
 	  ni=${#ii[@]}
 	  if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
 	  then
-            if egrep -n '^[1-9]' ${e}_diff.txt | $HOME/bin/nogood.py $e $ni
+            if egrep -n '^[1-9]' ${e}_diff.txt | \
+                  $HOME/bin/nogood.py $e $ni 2>> log
             then
               continue
             fi