Mercurial > hg > cc > cirrus_home
view bin/nogood.py @ 55:50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 14 Apr 2020 16:10:22 +0100 |
parents | 396d1f17c671 |
children |
line wrap: on
line source
#!/lustre/sw/miniconda3/bin/python3
'''Detect bad diffs between warc.sh log file and existing extract_...tar

Usage: nogood.py segid numChanged

Standard input carries one "<position>:<diff command>" entry per line
(presumably grep -n style output over a *_diff.txt file -- confirm
against the caller).  An entry is acceptable, and counts as one change,
when it is:

  * a multi-line add            (NaN,M)
  * a single-line add (NaN) whose next entry is a multi-line change
    (NcN,M) positioned exactly 2 further on
  * a single-line change (NcN) whose next entry is a multi-line add
    (NaN,M) positioned exactly 4 further on
  * a single-line add or change that is the last entry of the input

Anything else, or a total that differs from numChanged, is "no good".

Note the slightly counter-intuitive value: 0 if the input is no good
(1 if it is good).  Because an uncaught exception would also exit with
status 1, every malformed-input case is reported explicitly as no good
rather than being allowed to raise.
'''

import re
import sys

shortPat = '[1-9][0-9]*%s[1-9][0-9]*'
SHORT_ADD = re.compile(shortPat % 'a')
SHORT_CHANGE = re.compile(shortPat % 'c')
longPat = shortPat + ',[1-9][0-9]*'
LONG_ADD = re.compile(longPat % 'a')
LONG_CHANGE = re.compile(longPat % 'c')

# For each single-line command: how far on its partner entry must sit,
# and the pattern that partner's command must match.
# as in 1566027315865.44/40_diff.txt
_PARTNERS = (
    (SHORT_ADD, 2, LONG_CHANGE),
    (SHORT_CHANGE, 4, LONG_ADD),
)


def _split_entry(line):
    '''Return (position, command) for a "position:command" line.

    Returns None when the line does not contain exactly one colon.
    '''
    parts = line.rstrip().split(':')
    if len(parts) != 2:
        return None
    return parts[0], parts[1]


def _partner_for(command):
    '''Return (offset, partner_pattern) for a single-line command, else None.'''
    for short, offset, partner in _PARTNERS:
        if short.fullmatch(command):
            return offset, partner
    return None


def check_diffs(lines, n, segment):
    '''Check the diff entries in lines against the expected count n.

    lines   -- iterable of "position:command" strings (e.g. sys.stdin)
    n       -- number of changes the caller expects
    segment -- segment id, used only in the complaint text

    Returns None when every entry is acceptable and exactly n changes
    were counted; otherwise returns the complaint to show the user.
    '''
    entries = iter(lines)
    c = 0
    for line in entries:
        entry = _split_entry(line)
        if entry is None:
            return "can't parse diff line %r for segment %s" % (
                line.rstrip(), segment)
        i, d = entry
        if LONG_ADD.fullmatch(d):
            c += 1
            continue
        unhandled = "can't handle diff at %s: %s for segment %s" % (
            i, d, segment)
        partner = _partner_for(d)
        if partner is None:
            return unhandled
        offset, partner_pat = partner
        # Look ahead one entry.  End of input (or a blank line) after a
        # single-line command is accepted and counted.
        following = next(entries, '')
        if not following.rstrip():
            c += 1
            continue
        entry = _split_entry(following)
        if entry is None:
            # A garbled partner is not the same thing as end of input.
            return "can't parse diff line %r for segment %s" % (
                following.rstrip(), segment)
        j, e = entry
        try:
            adjacent = int(j) == int(i) + offset
        except ValueError:
            # Non-numeric position: the pairing cannot be confirmed.
            adjacent = False
        if adjacent and partner_pat.fullmatch(e):
            c += 1
            continue
        return unhandled
    if c != n:
        return "too many diffs? (%s<>%s) for segment %s" % (c, n, segment)
    return None


def main(argv=None, stdin=None):
    '''Run the check and return the process exit status.

    argv  -- [segid, numChanged]; defaults to sys.argv[1:]
    stdin -- iterable of entry lines; defaults to sys.stdin

    Returns 0 if the input is no good (complaint written to stderr) and
    1 if it is good.  A bad command line also returns 0: it cannot vouch
    for the input, and a non-zero status would read as "good".
    '''
    if argv is None:
        argv = sys.argv[1:]
    if stdin is None:
        stdin = sys.stdin
    try:
        (segment, n) = map(int, argv)
    except ValueError:
        print("Usage: nogood.py segid numChanged", file=sys.stderr)
        return 0
    complaint = check_diffs(stdin, n, segment)
    if complaint is not None:
        print(complaint, file=sys.stderr)
        return 0
    return 1


if __name__ == '__main__':
    sys.exit(main())