view bin/nogood.py @ 55:50556ac15e88

one-off to convert big extracts.tar into lots of smaller ones
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 14 Apr 2020 16:10:22 +0100
parents 396d1f17c671
children
line wrap: on
line source

#!/lustre/sw/miniconda3/bin/python3
'''Detect bad diffs between warc.sh log file and existing extract_...tar
Usage: nogood.py segid numChanged
Note the slightly counter-intuitive exit status: 0 if the input is no good,
1 if it is acceptable'''
import sys,re
# Patterns for the diff-command part of each input line.  "Short" commands
# have the form N<cmd>M (e.g. 5a6); "long" ones add a second number after a
# comma, N<cmd>M,K (e.g. 5a6,7).
shortPat='[1-9][0-9]*%s[1-9][0-9]*'
SHORT_ADD=re.compile(shortPat%'a')
SHORT_CHANGE=re.compile(shortPat%'c')
longPat=shortPat+',[1-9][0-9]*'
LONG_ADD=re.compile(longPat%'a')
LONG_CHANGE=re.compile(longPat%'c')


def _split_entry(line):
    '''Split a "lineno:diffcmd" input line; return None unless there is
    exactly one ':' separator.'''
    parts = line.rstrip().split(':')
    if len(parts) != 2:
        return None
    return tuple(parts)


def _follow_ok(i, following, gap, pattern):
    '''Decide whether the entry after a short add/change is acceptable.

    following is the raw next input line, or None at end of input.
    End of input is accepted.  Otherwise the next entry must sit exactly
    gap lines after i and its command must fully match pattern.
    '''
    if following is None:
        # Nothing follows the short command: tolerated, as before.
        return True
    entry = _split_entry(following)
    if entry is None:
        # A malformed follow-up line is not the same thing as EOF.
        return False
    j, e = entry
    try:
        adjacent = int(j) == int(i) + gap
    except ValueError:
        # Non-numeric line number: cannot be the expected neighbour.
        return False
    return adjacent and pattern.fullmatch(e) is not None


def check_diffs(lines, segment, n, err=None):
    '''Check a stream of "lineno:diffcmd" lines for acceptable diffs.

    Accepted shapes, each counted once:
      * a long add on its own;
      * a short add, followed 2 lines later by a long change (or by EOF);
      * a short change, followed 4 lines later by a long add (or by EOF).
    For the short forms the following input line is consumed as part of
    the same unit.

    lines   -- iterable of input lines (e.g. sys.stdin)
    segment -- segment id, used only in diagnostics
    n       -- number of accepted units the input must contain
    err     -- stream for diagnostics; defaults to sys.stderr

    Returns True if every line was accepted and exactly n units were
    counted, False (after writing one diagnostic to err) otherwise.
    Malformed lines yield False rather than raising, because an uncaught
    exception would end the process with status 1, which callers of this
    script read as "good".
    '''
    if err is None:
        err = sys.stderr
    it = iter(lines)
    c = 0
    for line in it:
        entry = _split_entry(line)
        if entry is None:
            print("can't parse diff line %r for segment %s"
                  % (line.rstrip(), segment), file=err)
            return False
        i, d = entry
        if LONG_ADD.fullmatch(d):
            c += 1
            continue
        # Short commands are judged together with the entry after them.
        if SHORT_ADD.fullmatch(d):
            # as in 1566027315865.44/40_diff.txt
            follow = (2, LONG_CHANGE)
        elif SHORT_CHANGE.fullmatch(d):
            # as in 1566027315865.44/40_diff.txt
            follow = (4, LONG_ADD)
        else:
            follow = None
        # next() is only reached for short commands, so no other branch
        # ever swallows a look-ahead line.
        if follow is not None and _follow_ok(i, next(it, None), *follow):
            c += 1
            continue
        print("can't handle diff at %s: %s for segment %s"%(i,d,segment),
              file=err)
        return False
    if c != n:
        print("too many diffs? (%s<>%s) for segment %s"%(c,n,segment),
              file=err)
        return False
    return True


def main(argv=None, stdin=None):
    '''Command-line entry point; returns the process exit status.

    The status is deliberately inverted: 0 means the input is no good,
    1 means it is acceptable.  Bad arguments also give 0, so a usage
    error can never be mistaken for an acceptable input.
    '''
    if argv is None:
        argv = sys.argv
    if stdin is None:
        stdin = sys.stdin
    try:
        (segment, n) = map(int, argv[1:])
    except ValueError:
        # Wrong number of arguments, or a non-integer argument.
        print("Usage: nogood.py segid numChanged", file=sys.stderr)
        return 0
    return 1 if check_diffs(stdin, segment, n) else 0


if __name__ == "__main__":
    sys.exit(main())