annotate bin/nogood.py @ 55:50556ac15e88

one-off to convert big extracts.tar into lots of smaller ones
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 14 Apr 2020 16:10:22 +0100
parents 396d1f17c671
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
52
9cd9daf75183 working towards more robust diff checking
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/lustre/sw/miniconda3/bin/python3
'''Detect bad diffs between warc.sh log file and existing extract_...tar
Usage: nogood.py segid numChanged
Reads lines of the form LINENO:DIFFCMD from stdin, where DIFFCMD is a
diff change command such as 5a6,7 (LINENO presumably being the position
of that command within the diff output -- confirm against the caller).

Note the slightly counter-intuitive value: 0 if the input is no good
Exit status:
  0  input is no good (unhandled or unparsable diff, or wrong count)
  1  input is good (every diff recognised and the count equals numChanged)
  2  bad command line'''
import re
import sys

shortPat='[1-9][0-9]*%s[1-9][0-9]*'
SHORT_ADD=re.compile(shortPat%'a')
SHORT_CHANGE=re.compile(shortPat%'c')
longPat=shortPat+',[1-9][0-9]*'
LONG_ADD=re.compile(longPat%'a')
LONG_CHANGE=re.compile(longPat%'c')

# Exit statuses -- inverted with respect to the usual shell convention,
# see the module docstring.
NO_GOOD=0
GOOD=1
USAGE_ERROR=2


def _split(line):
    '''Return (lineno, cmd) for a "lineno:cmd" line, both as strings,
    or None if the line does not contain exactly one colon.'''
    parts=line.rstrip().split(':')
    if len(parts)!=2:
        return None
    return (parts[0],parts[1])


def _follows(first,second,gap):
    '''True iff both line numbers are integers and second==first+gap.'''
    try:
        return int(second)==int(first)+gap
    except ValueError:
        return False


def check(lines,segment,n,err=None):
    '''Validate the diff commands in lines and compare their count with n.

    lines   -- iterable of "LINENO:DIFFCMD" strings (e.g. sys.stdin)
    segment -- segment id, used only in diagnostics
    n       -- expected number of accepted diffs
    err     -- stream for diagnostics, defaults to sys.stderr

    Accepted diffs, each counted once:
      * a long add (NaM,K)
      * a short add (NaM) immediately followed, 2 lines on, by a long change
      * a short change (NcM) immediately followed, 4 lines on, by a long add
      * a short add or short change that is the very last line of input
    Returns GOOD (1) if everything is accepted and the count equals n,
    otherwise NO_GOOD (0) after writing a diagnostic to err.'''
    if err is None:
        err=sys.stderr
    # One shared iterator, so the follow-up line of a short diff is
    # consumed from the same stream as the main loop.
    lines=iter(lines)
    count=0
    for line in lines:
        entry=_split(line)
        if entry is None:
            # Previously an uncaught ValueError here made the interpreter
            # exit with status 1, i.e. unparsable input was reported as good.
            print("can't parse diff line %r for segment %s"%(line.rstrip(),
                                                             segment),
                  file=err)
            return NO_GOOD
        (i,d)=entry
        if LONG_ADD.fullmatch(d):
            count+=1
            continue
        if SHORT_ADD.fullmatch(d):
            # as in 1566027315865.44/40_diff.txt
            (gap,partner)=(2,LONG_CHANGE)
        elif SHORT_CHANGE.fullmatch(d):
            # as in 1566027315865.44/40_diff.txt
            (gap,partner)=(4,LONG_ADD)
        else:
            partner=None
        if partner is not None:
            follow=next(lines,None)
            if follow is None:
                # Genuine EOF: a trailing short diff is accepted
                count+=1
                continue
            pair=_split(follow)
            # A malformed follow-up line is NOT treated as EOF: it falls
            # through to the "can't handle" report below.
            if (pair is not None and _follows(i,pair[0],gap)
                    and partner.fullmatch(pair[1])):
                count+=1
                continue
        print("can't handle diff at %s: %s for segment %s"%(i,d,segment),
              file=err)
        return NO_GOOD
    if count!=n:
        print("too many diffs? (%s<>%s) for segment %s"%(count,n,segment),
              file=err)
        return NO_GOOD
    return GOOD


def main(argv=None,stdin=None,stderr=None):
    '''Command-line entry point; returns the process exit status.

    argv   -- [segid, numChanged], defaults to sys.argv[1:]
    stdin  -- iterable of input lines, defaults to sys.stdin
    stderr -- stream for diagnostics, defaults to sys.stderr'''
    if argv is None:
        argv=sys.argv[1:]
    if stdin is None:
        stdin=sys.stdin
    if stderr is None:
        stderr=sys.stderr
    try:
        # ValueError covers both a wrong argument count and non-integers
        (segment,n)=map(int,argv)
    except ValueError:
        print("Usage: nogood.py segid numChanged",file=stderr)
        # Nonzero, like the traceback exit it replaces, so shell
        # conditionals on the status are unaffected.
        return USAGE_ERROR
    return check(stdin,segment,n,stderr)


if __name__=='__main__':
    sys.exit(main())