Mercurial > hg > cc > cirrus_home
annotate bin/nogood.py @ 55:50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 14 Apr 2020 16:10:22 +0100 |
parents | 396d1f17c671 |
children |
rev | line source |
---|---|
52
9cd9daf75183
working towards more robust diff checking
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
#!/lustre/sw/miniconda3/bin/python3
"""Detect bad diffs between warc.sh log file and existing extract_...tar

Usage: nogood.py segid numChanged

Reads lines of the form ``index:descriptor`` from standard input, where the
descriptor is expected to look like a diff "normal format" change command
(``NaM``, ``NcM``, ``NaM,K`` or ``NcM,K``).  An entry is accepted when it is

* a long add (``NaM,K``) on its own, or
* a short add (``NaM``) whose next line has index+2 and is a long change, or
* a short change (``NcM``) whose next line has index+4 and is a long add, or
* a short add / short change that is the very last line of the input.

Each accepted entry (a pair counts once) increments a counter, which must
equal numChanged.

Note the slightly counter-intuitive exit status: 0 if the input is no good,
1 if it is OK.  Because an uncaught Python exception also exits with status
1, every failure mode (bad arguments, malformed lines) is reported explicitly
with status 0 rather than being allowed to raise.
"""
import re
import sys

shortPat = '[1-9][0-9]*%s[1-9][0-9]*'
SHORT_ADD = re.compile(shortPat % 'a')
SHORT_CHANGE = re.compile(shortPat % 'c')
longPat = shortPat + ',[1-9][0-9]*'
LONG_ADD = re.compile(longPat % 'a')
LONG_CHANGE = re.compile(longPat % 'c')

# Exit statuses (see module docstring for why they look inverted).
NO_GOOD = 0
GOOD = 1

# (pattern of first line, required index gap, pattern of follow-up line)
# as in 1566027315865.44/40_diff.txt
_PAIRS = (
    (SHORT_ADD, 2, LONG_CHANGE),
    (SHORT_CHANGE, 4, LONG_ADD),
)


def _split_entry(line):
    """Return ``(index, descriptor)`` strings, or None if *line* is malformed."""
    parts = line.rstrip().split(':')
    if len(parts) != 2:
        return None
    return parts[0], parts[1]


def _follows(first_index, follow, gap, pattern):
    """Tell whether *follow* is the expected companion of a short entry."""
    if follow is None:
        return False
    follow_index, follow_desc = follow
    try:
        in_place = int(follow_index) == int(first_index) + gap
    except ValueError:
        # Non-numeric index: cannot be the expected companion line.
        return False
    return in_place and pattern.fullmatch(follow_desc) is not None


def _scan(lines):
    """Count acceptable entries in *lines*.

    Returns ``(count, problem)``.  *problem* is None when every entry was
    accepted, otherwise a message describing the first offending entry
    (scanning stops there).
    """
    stream = iter(lines)
    count = 0
    for line in stream:
        entry = _split_entry(line)
        if entry is None:
            return count, "malformed diff line %r" % line.rstrip()
        index, desc = entry
        if LONG_ADD.fullmatch(desc):
            count += 1
            continue
        for first_pat, gap, follow_pat in _PAIRS:
            if first_pat.fullmatch(desc):
                break
        else:
            return count, "can't handle diff at %s: %s" % (index, desc)
        # A short entry consumes the following line as its companion.
        companion = next(stream, None)
        if companion is None:
            # Genuine EOF right after a short entry: accepted, as before.
            count += 1
            continue
        if _follows(index, _split_entry(companion), gap, follow_pat):
            count += 1
            continue
        return count, "can't handle diff at %s: %s" % (index, desc)
    return count, None


def main(argv=None, lines=None):
    """Run the check and return the process exit status.

    argv  -- ``[segid, numChanged]``; defaults to ``sys.argv[1:]``.
    lines -- iterable of input lines; defaults to ``sys.stdin``.

    Returns NO_GOOD (0) on any problem, GOOD (1) when the number of accepted
    entries equals numChanged.  Diagnostics go to standard error.
    """
    if argv is None:
        argv = sys.argv[1:]
    if lines is None:
        lines = sys.stdin
    try:
        (segment, n) = map(int, argv)
    except ValueError:
        # Wrong number of arguments or a non-integer argument.
        print("Usage: nogood.py segid numChanged", file=sys.stderr)
        return NO_GOOD
    (c, problem) = _scan(lines)
    if problem is not None:
        print("%s for segment %s" % (problem, segment), file=sys.stderr)
        return NO_GOOD
    if c != n:
        print("too many diffs? (%s<>%s) for segment %s" % (c, n, segment),
              file=sys.stderr)
        return NO_GOOD
    return GOOD


if __name__ == "__main__":
    sys.exit(main())