Mercurial > hg > cc > cirrus_work
annotate bin/merge_date.py @ 109:52c6a9b0fc8c
loosen must-match criterion in the both-messy case
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 19 Sep 2023 19:29:41 +0100 |
parents | 40c460fed99f |
children | 4a52585a1aac |
rev | line source |
---|---|
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 that year's index |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 Usage: merge_date.py ksvstream cdx-dir outdir |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 ksvstream consists of tab-separated key, CC date and Unix timestamp |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 ''' # ' |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
10 import sys, io, os, os.path, time, re |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 from isal import igzip |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
13 |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
14 DEBUG = 0 |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
15 while sys.argv[1] == '-d': |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
16 sys.argv.pop(1) |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
17 DEBUG += 1 |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
18 |
90 | 19 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] |
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
21 |
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
# NOTE: all patterns are raw bytes literals so that regex escapes such
# as \? and \{ reach the re module untouched instead of being (invalid)
# bytes-literal escape sequences.

# Matches the "filename" property of a record whose file lives under a
# crawldiagnostics or robotstxt segment subdirectory
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Group 1: everything up to and including the first '?'
# Group 2: a session-id parameter (name=value) immediately after it
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# Same pattern as SESSION, but matched case-insensitively
ISESSION = re.compile(SESSION.pattern,flags=re.I)
# Captures the value of a leading "url" property
URL=re.compile(rb'\{"url": "([^"]*)"')
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
29 |
107
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
30 # Above based on this from broken Java code: |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
31 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
32 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
33 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
34 #(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
35 #(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
36 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
37 |
90 | 38 #print(sys.argv[3],NPATH,file=sys.stderr) |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
39 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
40 os.makedirs(sys.argv[3], exist_ok=True) |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 |
90 | 42 FN = 0 |
43 | |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
44 XCNT = 0 |
93
25bd398a8035
improve reordering, still failing on cdx-00004
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
45 DCNT = 0 |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
46 |
90 | 47 XF = igzip.IGzipFile(filename=XPATH%0) |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
48 NF = open(NN:=(NPATH%0),'wb') |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
49 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
def nextLine(xq, messyD):
    '''Return the next index line to process, together with the queue.

    If lines are waiting in xq and the current ks line is not messy
    (messyD is falsy), the oldest queued line is popped and returned.
    Otherwise a line is read from the current index file XF.  When XF
    is exhausted, XF and the current output file NF are closed, the
    name of the finished output file (NN) is printed on stdout, and
    reading/writing moves on to index file number FN+1.

    Returns (line, xq), or (None, None) when no further index file
    exists.
    '''
    global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
    if xq and not messyD:
        return xq.pop(0), xq
    while True:
        xl = XF.readline()
        XCNT += 1
        if xl != b'':
            return xl, xq
        # Current index file has run out: need to move to next index file
        FN += 1
        XF.close()
        NF.close()
        print(NN, flush=True) # so we can compress it
        time.sleep(0.1) # so it flushes?
        XN = XPATH%FN
        if not os.path.exists(XN):
            return (None, None)
        XF = igzip.IGzipFile(filename=XN)
        NF = open((NN:=NPATH%FN), 'wb')
        # Restart the per-file line count and go round again, rather than
        # returning whatever the first read yields: an empty index file
        # would otherwise hand b'' back to the caller
        XCNT = 0
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
73 |
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
def keys(key):
    '''Work around the failure of the 2019-35-vintage Java fixup to
    detect session ids at the very start of the parameter part.

    Returns a triple (messy, key1, key2):
      - SESSION does not match at the start of key: (False, key, None)
      - the session parameter is all there is after the '?':
        (True, <key up to but not including the '?'>, None)
      - otherwise: (True, <key up to and including the '?'>,
                    <key with the session parameter cut out>)
    '''
    found = SESSION.match(key)
    if not found:
        return False, key, None
    stem = found[1]
    cut_from, cut_to = found.span(2)
    remainder = key[:cut_from] + key[cut_to:]
    if remainder == stem:
        # Nothing but the session parameter followed the '?': drop the '?' too
        return True, stem[:-1], None
    return True, stem, remainder
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
87 |
92
e56a7aad9ce9
attempt at reordering if necessary
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
88 dfq = [] # for reordering if needed |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
89 messyD = False |
92
e56a7aad9ce9
attempt at reordering if necessary
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
90 |
90 | 91 with open(sys.argv[1], 'rb') as df: |
93
25bd398a8035
improve reordering, still failing on cdx-00004
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
92 dl = df.readline() |
25bd398a8035
improve reordering, still failing on cdx-00004
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
93 DCNT = 1 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
94 if DEBUG>1: |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
95 sys.stderr.write("dl1: %s"%dl.decode('ascii')) |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
96 dkey, ddate, durl, dtime = dl.split(b'\t') |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
97 messyD = ISESSION.search(durl) |
90 | 98 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
99 xq = [] |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
100 |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
101 while (nlRes := nextLine(xq, messyD))[0] is not None: |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
102 (xl, xq) = nlRes |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
103 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
104 m = URL.match(xprops) |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
105 if m: |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
106 xurl = m[1] |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
107 else: |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
108 raise ValueError("No url in %s"%xprops) |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
109 if DEBUG>1: |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
110 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
111 for xp in (xkey, xdate, xurl)))) |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
112 messyU, xkey1, xkey2 = keys(xkey) |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
113 if messyD: |
109
52c6a9b0fc8c
loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents:
107
diff
changeset
|
114 noMatch = (not dkey.startswith(xkey1) or |
52c6a9b0fc8c
loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents:
107
diff
changeset
|
115 (xkey2 is not None and dkey!=xkey2)) |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
116 if messyU: |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
117 # better match |
101
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
118 if noMatch: |
107
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
119 raise ValueError("Fail1: md: %s mu: %s\n" |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
120 " xkey: %s\n" |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
121 " dkey: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
122 " xdate: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
123 " ddate: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
124 " xurl: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
125 " durl: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
126 "dfq: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
127 "k1, k2: |%s|%s|\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
128 "FN: %s XCNT: %s DCNT: %s\n" |
107
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
129 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
130 (b'\n '.join(dfq)).decode('ascii'), |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
131 xkey1, xkey2, FN, XCNT, DCNT, xl)) |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
132 # fall through to the ordinary (non-messy) match case |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
133 else: |
101
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
134 # still looking, save if >= date else fall through to write |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
135 if DEBUG>1: |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
136 print("Diso: match: %s\n" |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
137 " xkey: %s\n" |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
138 " dkey: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
139 " xdate: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
140 " ddate: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
141 " xurl: %s\n" |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
142 " durl: %s\n" |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
143 "xl: %s"%(not noMatch, |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
144 xkey, dkey, xdate, ddate, xurl, durl, xl), |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
145 file=sys.stderr) |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
146 if (dkey.startswith(xkey1) and |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
147 (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))): |
101
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
148 xq.append(xl) |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
149 if DEBUG>1: |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
150 sys.stderr.write('xpush\n') |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
151 continue |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
100
diff
changeset
|
152 # else fall through |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
153 if (ddate != xdate or |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
154 not dkey.startswith(xkey1) or |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
155 (xkey2 is not None and dkey!=xkey2) or |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
156 durl!=xurl): |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
157 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
158 |
107
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
159 raise ValueError("Fail2: md: %s mu: %s\n" |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
160 " xkey: %s\n" |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
161 " dkey: %s\n" |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
162 " xdate: %s\n" |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
163 " ddate: %s\n" |
109
52c6a9b0fc8c
loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents:
107
diff
changeset
|
164 " xurl: %s\n" |
52c6a9b0fc8c
loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents:
107
diff
changeset
|
165 " durl: %s\n" |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
166 "dfq: %s\n" |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
167 "k1, k2: |%s|%s|\n" |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
168 "FN: %s XCNT: %s DCNT: %s\n" |
107
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
102
diff
changeset
|
169 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, |
109
52c6a9b0fc8c
loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents:
107
diff
changeset
|
170 xurl, durl, |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
171 (b'\n '.join(dfq)).decode('ascii'), |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
172 xkey1, xkey2, FN, XCNT, DCNT, xl)) |
90 | 173 NF.write(xl) |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
174 if DEBUG>1: |
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
175 sys.stderr.write("out_nl\n") |
90 | 176 continue |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
177 # Got it |
90 | 178 NF.write(xkey) |
179 NF.write(b' ') | |
180 NF.write(xdate) | |
181 NF.write(b' ') | |
182 NF.write(xprops[:-2]) | |
183 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) | |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
184 if DEBUG>1: |
102
e606c609f813
reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
101
diff
changeset
|
185 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
186 for xp in (xkey, xdate, xurl)))) |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
187 sys.stderr.write(" %d\n"%int(dtime[:-3])) |
90 | 188 dl = df.readline() |
189 if dl == b'': | |
93
25bd398a8035
improve reordering, still failing on cdx-00004
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
190 if dfq: |
25bd398a8035
improve reordering, still failing on cdx-00004
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
191 if DEBUG: |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
192 raise ValueError |
90 | 193 # write out the last of the last index file, if any |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
194 dkey = ddate = durl = b"" |
90 | 195 else: |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
196 if DEBUG>1: |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
197 sys.stderr.write("dl3: %s"%dl.decode('ascii')) |
93
25bd398a8035
improve reordering, still failing on cdx-00004
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
198 DCNT += 1 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
199 dkey, ddate, durl, dtime = dl.split(b'\t') |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
200 messyD = ISESSION.search(durl) |