Mercurial > hg > cc > cirrus_work
annotate bin/merge_date.py @ 101:e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 11 Sep 2023 22:06:45 +0100 |
parents | 18446a7eeb9e |
children | e606c609f813 |
rev | line source |
---|---|
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 that year's index |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 Usage: merge_date.py [-d]... ksvstream cdx-dir outdir |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 ksvstream consists of tab-separated key, CC date, URL and Unix timestamp |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 ''' # ' |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
10 import sys, io, os, os.path, time, re |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 from isal import igzip |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
13 |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
# Debug verbosity: every leading '-d' on the command line raises it by one.
DEBUG = 0
# The length guard keeps a command line that consists only of '-d' flags
# from failing with IndexError before the usage check below can report it.
while len(sys.argv) > 1 and sys.argv[1] == '-d':
    sys.argv.pop(1)
    DEBUG += 1

if len(sys.argv) < 4:
    sys.exit("Usage: merge_date.py [-d]... ksvstream cdx-dir outdir")

# Filename templates for the numbered input (gzipped) and output index files;
# the remaining %-slot takes the file number (see the uses with FN below).
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

# All patterns are raw bytes literals: '\?' and '\{' are not valid escapes
# in ordinary literals (SyntaxWarning on current Pythons).  The resulting
# pattern bytes are unchanged.
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Group 1: everything up to and including the first '?'.
# Group 2: a session-id parameter immediately following it.
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# Same pattern, case-insensitive.
ISESSION = re.compile(SESSION.pattern,flags=re.I)
# Captures the value of the leading "url" property of an index line's JSON.
URL=re.compile(rb'\{"url": "([^"]*)"')

# Above based on this from fixed Java code:
#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

os.makedirs(sys.argv[3], exist_ok=True)

FN = 0      # number of the index file currently being read/written

XCNT = 0    # lines read so far from the current input index file
DCNT = 0    # lines read so far from the dated (ks) stream

# Current input index file and the output file it is being rewritten into;
# NN remembers the output file's name so it can be reported once complete.
XF = igzip.IGzipFile(filename=XPATH%0)
NF = open(NN:=(NPATH%0),'wb')
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
48 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
def nextLine(xq, messyD):
    '''Return the next index line to process, together with the queue.

    xq is the list of index lines that were set aside earlier; messyD is
    truthy while the current dated-stream line has a session-id URL.

    If xq is non-empty and messyD is falsy, the oldest queued line is
    popped and returned.  Otherwise a line is read from the current input
    index file XF.  When XF is exhausted, XF and the current output file NF
    are closed, the finished output file's name is printed on stdout, and
    the next numbered input/output pair is opened.  Index files that exist
    but contain no lines are skipped rather than handed to the caller as an
    empty line.

    Returns (line, xq), or (None, None) once there is no further input
    index file.
    '''
    global FN, NF, NN, XF, XCNT
    if xq and not messyD:
        return xq.pop(0), xq
    while True:
        xl = XF.readline()
        XCNT += 1
        if xl != b'':
            return xl, xq
        # need to move to next index file
        FN += 1
        XF.close()
        NF.close()
        print(NN, flush=True) # so we can compress it
        time.sleep(0.1) # so it flushes?
        XN = XPATH%FN
        if not os.path.exists(XN):
            return (None, None)
        XF = igzip.IGzipFile(filename=XN)
        NF = open((NN:=NPATH%FN), 'wb')
        # Restart the per-file count; the read at the top of the loop makes
        # it 1 for the first line of the new file.
        XCNT = 0
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
72 |
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
def keys(key):
    '''Deal with failure of 2019-35-vintage Java fixup to detect
    parameter-part-initial session ids.

    Returns a triple (messy, key1, key2):
      * SESSION does not match key: (False, key, None).
      * The session parameter is all that follows the '?':
        (True, <part before the '?'>, None).
      * Otherwise: (True, <part up to and including the '?'>,
        <key with the session parameter cut out>).
    '''
    found = SESSION.match(key)
    if found is None:
        return False, key, None
    head = found[1]
    # Cut the matched session parameter (group 2) out of the key
    cut_from, cut_to = found.span(2)
    stripped = key[:cut_from] + key[cut_to:]
    if stripped == head:
        # Nothing but the session parameter followed the '?': drop the '?'
        return True, head[:-1], None
    return True, head, stripped
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
86 |
92
e56a7aad9ce9
attempt at reordering if necessary
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
dfq = [] # for reordering if needed (nothing visible here ever appends to it)
messyD = False

# Merge loop: walk the index files line by line and, for each index line
# that matches the current line of the dated stream (the "D" line), write it
# out with an added "lastmod" property; every other index line is copied
# through unchanged.
with open(sys.argv[1], 'rb') as df:
    dl = df.readline()
    DCNT = 1
    if DEBUG>1:
        sys.stderr.write("dl1: %s"%dl.decode('ascii'))
    dkey, ddate, durl, dtime = dl.split(b'\t')
    # Truthy when the D line's URL contains a session-id parameter
    messyD = ISESSION.search(durl)

    xq = []

    while (nlRes := nextLine(xq, messyD))[0] is not None:
        (xl, xq) = nlRes
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        m = URL.match(xprops)
        if m:
            xurl = m[1]
        else:
            raise ValueError("No url in %s"%xprops)
        if DEBUG>1:
            sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                                  for xp in (xkey, xdate, xurl))))
        messyU, xkey1, xkey2 = keys(xkey)
        if messyD:
            noMatch = (ddate != xdate or
                       not dkey.startswith(xkey1) or
                       dkey!=xkey1 or
                       durl!=xurl)
            if messyU:
                # better match
                if noMatch:
                    raise ValueError("Fail: xkey: %s\n"
                                     " dkey: %s\n"
                                     " xdate: %s\n"
                                     " ddate: %s\n"
                                     " xurl: %s\n"
                                     " durl: %s\n"
                                     "dfq: %s\n"
                                     "k1, k2: |%s|%s|\n"
                                     "FN: %s XCNT: %s DCNT: %s\n"
                                     "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
                                               (b'\n '.join(dfq)).decode('ascii'),
                                               xkey1, xkey2, FN, XCNT, DCNT, xl))
                # fall through to the ordinary (non-messy) match case
            else:
                # still looking, save if >= date else fall through to write
                if DEBUG>1:
                    print("Diso: xkey: %s\n"
                          " dkey: %s\n"
                          " xdate: %s\n"
                          " ddate: %s\n"
                          " xurl: %s\n"
                          " durl: %s\n"
                          "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, xl),
                          file=sys.stderr)
                if not noMatch:
                    xq.append(xl)
                    if DEBUG>1:
                        sys.stderr.write('xpush\n')
                    continue
                # else fall through
        else:
            # Not messyD
            if messyU:
                raise ValueError("messyU w/o messyD:"
                                 "xkey: %s\n"
                                 "dkey: %s\n"
                                 "xdate: %s\n"
                                 "ddate: %s\n"
                                 "xurl: %s\n"
                                 "durl: %s\n"
                                 "dfq: %s\n"
                                 "k1, k2: |%s|%s|\n"
                                 "FN: %s XCNT: %s DCNT: %s\n"
                                 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
                                           (b'\n '.join(dfq)).decode('ascii'),
                                           xkey1, xkey2, FN, XCNT, DCNT, xl))
        if (ddate != xdate or
            not dkey.startswith(xkey1) or
            (xkey2 is not None and dkey!=xkey2) or
            durl!=xurl):
            # Not the line the D record refers to: copy it through unchanged
            NF.write(xl)
            continue
        # Got it
        NF.write(xkey)
        NF.write(b' ')
        NF.write(xdate)
        NF.write(b' ')
        # Drop the last two bytes of the properties (presumably the closing
        # '}' and the newline -- TODO confirm) so that the lastmod property
        # can be appended before the object is closed again.
        NF.write(xprops[:-2])
        # dtime loses its last three bytes before conversion (presumably a
        # trailing '.0' plus newline -- TODO confirm against the ks files)
        NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
        if DEBUG>1:
            sys.stderr.write("out: %s"%(' '.join(xp.decode('ascii')
                                                 for xp in (xkey, xdate, xurl))))
            sys.stderr.write(" %d\n"%int(dtime[:-3]))
        dl = df.readline()
        # df is opened in binary mode, so end of file is b'' (a comparison
        # with '' is never true and the split below would then fail).
        if dl == b'':
            if dfq:
                if DEBUG:
                    raise ValueError
            # write out the last of the last index file, if any
            # Bytes sentinels: these are compared against bytes index fields
            # and must never match a real line.
            dkey = ddate = durl = b""
            # No current D line any more, so it cannot be a messy one; this
            # also lets nextLine drain whatever is still queued in xq.
            # NOTE(review): index keys that match SESSION still raise in the
            # "Not messyD" branch above once the stream is exhausted --
            # confirm that is intended.
            messyD = None
        else:
            if DEBUG>1:
                sys.stderr.write("dl3: %s"%dl.decode('ascii'))
            DCNT += 1
            dkey, ddate, durl, dtime = dl.split(b'\t')
            messyD = ISESSION.search(durl)