annotate bin/merge_date.py @ 101:e2e64c3d763e

bug4 fixed, but that created a new, earlier bug
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 11 Sep 2023 22:06:45 +0100
parents 18446a7eeb9e
children e606c609f813
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 that year's index
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 Usage: merge_date.py [-d]... ksvstream cdx-dir outdir
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 ksvstream consists of tab-separated key, CC date, URL and Unix timestamp
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 ''' # '
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
10 import sys, io, os, os.path, time, re
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 from isal import igzip
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
13
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
# Each leading -d on the command line raises the debug level by one.
DEBUG = 0
while len(sys.argv) > 1 and sys.argv[1] == '-d':
    sys.argv.pop(1)
    DEBUG += 1

if len(sys.argv) < 4:
    # Report the usage text instead of failing with a bare IndexError below.
    sys.exit(__doc__)

# File name templates, completed with the index file number:
# gzipped input index files and uncompressed output index files.
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

# Raw bytes literals: \? and \{ are meant for the regex engine, and are not
# valid escape sequences in an ordinary bytes literal.
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# Same pattern, matched case-insensitively
ISESSION = re.compile(SESSION.pattern,flags=re.I)
URL=re.compile(rb'\{"url": "([^"]*)"')

# Above based on this from fixed Java code:
#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

os.makedirs(sys.argv[3], exist_ok=True)

FN = 0    # number of the index file currently being read

XCNT = 0  # lines read so far from the current index file
DCNT = 0  # lines read so far from the ks stream

# Start with index file 0 and its output counterpart
XF = igzip.IGzipFile(filename=XPATH%0)
NF = open(NN:=(NPATH%0),'wb')
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
def nextLine(xq, messyD):
    '''Return the next index line to process, together with the queue.

    xq is a list of held-back index lines.  If it is non-empty and
    messyD is false, its oldest entry is returned.  Otherwise a line is
    read from the current index file XF.  When XF has run out, it and
    the current output file NF are closed, the finished output file's
    name is printed on stdout, and the next numbered index file (with a
    fresh output file) is opened.

    Returns (line, xq), or (None, None) when there is no next index file.
    '''
    global FN, NF, NN, XF, XCNT
    if xq and not messyD:
        return xq.pop(0), xq
    while True:
        xl = XF.readline()
        XCNT += 1
        if xl != b'':
            return xl, xq
        # need to move to next index file
        FN += 1
        XF.close()
        NF.close()
        print(NN, flush=True) # so we can compress it
        time.sleep(0.1) # so it flushes?
        XN = XPATH%FN
        if not os.path.exists(XN):
            return (None, None)
        XF = igzip.IGzipFile(filename=XN)
        NF = open((NN:=NPATH%FN), 'wb')
        XCNT = 0
        # Go round again to read the first line of the new file, so that
        # an empty index file is skipped instead of handing b'' to the caller.
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
72
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
def keys(key):
    '''Work around the 2019-35-vintage Java fixup failing to detect
    session ids at the very start of the parameter part of a key.

    Returns a triple (messy, key1, key2):
      no session id directly after the '?':  (False, key, None)
      the session id is all that follows:     (True, key up to but not
                                               including the '?', None)
      otherwise:                              (True, key up to and including
                                               the '?', key with the session
                                               id parameter cut out)
    '''
    m = SESSION.match(key)
    if not m:
        return False, key, None
    prefix = m[1]
    start, end = m.span(2)
    # Splice the session id parameter out of the key
    fixed = key[:start] + key[end:]
    if fixed == prefix:
        # Nothing but the session id followed the '?', so drop the '?' too
        return True, prefix[:-1], None
    return True, prefix, fixed
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
86
92
e56a7aad9ce9 attempt at reordering if necessary
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 91
diff changeset
# Main merge: walk the index (cdx) lines in order, keeping one current ks
# line (dkey, ddate, durl, dtime).  An index line that matches the current
# ks line is written out with a "lastmod" property added and the next ks
# line is read; any other index line is copied through unchanged.

dfq = [] # for reordering if needed
messyD = False

with open(sys.argv[1], 'rb') as df:
    dl = df.readline()
    DCNT = 1
    if DEBUG>1:
        sys.stderr.write("dl1: %s"%dl.decode('ascii'))
    dkey, ddate, durl, dtime = dl.split(b'\t')
    # Truthy when the ks URL contains a session id parameter
    messyD = ISESSION.search(durl)

    xq = []

    while (nlRes := nextLine(xq, messyD))[0] is not None:
        (xl, xq) = nlRes
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        m = URL.match(xprops)
        if m:
            xurl = m[1]
        else:
            raise ValueError("No url in %s"%xprops)
        if DEBUG>1:
            sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                                  for xp in (xkey, xdate, xurl))))
        messyU, xkey1, xkey2 = keys(xkey)
        if messyD:
            noMatch = (ddate != xdate or
                       not dkey.startswith(xkey1) or
                       dkey!=xkey1 or
                       durl!=xurl)
            if messyU:
                # better match
                if noMatch:
                    raise ValueError("Fail: xkey: %s\n"
                                     " dkey: %s\n"
                                     " xdate: %s\n"
                                     " ddate: %s\n"
                                     " xurl: %s\n"
                                     " durl: %s\n"
                                     "dfq: %s\n"
                                     "k1, k2: |%s|%s|\n"
                                     "FN: %s XCNT: %s DCNT: %s\n"
                                     "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
                                               (b'\n '.join(dfq)).decode('ascii'),
                                               xkey1, xkey2, FN, XCNT, DCNT, xl))
                # fall through to the ordinary (non-messy) match case
            else:
                # still looking, save if >= date else fall through to write
                if DEBUG>1:
                    print("Diso: xkey: %s\n"
                          " dkey: %s\n"
                          " xdate: %s\n"
                          " ddate: %s\n"
                          " xurl: %s\n"
                          " durl: %s\n"
                          "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, xl),
                          file=sys.stderr)
                if not noMatch:
                    xq.append(xl)
                    if DEBUG>1:
                        sys.stderr.write('xpush\n')
                    continue
                # else fall through
        else:
            # Not messyD
            if messyU:
                raise ValueError("messyU w/o messyD:"
                                 "xkey: %s\n"
                                 "dkey: %s\n"
                                 "xdate: %s\n"
                                 "ddate: %s\n"
                                 "xurl: %s\n"
                                 "durl: %s\n"
                                 "dfq: %s\n"
                                 "k1, k2: |%s|%s|\n"
                                 "FN: %s XCNT: %s DCNT: %s\n"
                                 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
                                           (b'\n '.join(dfq)).decode('ascii'),
                                           xkey1, xkey2, FN, XCNT, DCNT, xl))
        if (ddate != xdate or
            not dkey.startswith(xkey1) or
            (xkey2 is not None and dkey!=xkey2) or
            durl!=xurl):
            # Not the line the current ks entry belongs to: copy unchanged
            NF.write(xl)
            continue
        # Got it
        NF.write(xkey)
        NF.write(b' ')
        NF.write(xdate)
        NF.write(b' ')
        # Drop the last two bytes of the properties (presumably the closing
        # '}' and the newline) so the new property can be appended
        NF.write(xprops[:-2])
        # The last three bytes of the time field are discarded
        # (presumably a two-character suffix plus the newline -- confirm)
        NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
        if DEBUG>1:
            sys.stderr.write("out: %s"%(' '.join(xp.decode('ascii')
                                                 for xp in (xkey, xdate, xurl))))
            sys.stderr.write(" %d\n"%int(dtime[:-3]))
        dl = df.readline()
        if dl == b'':
            # df is read in binary mode, so end of input is b'', never ''
            if dfq:
                if DEBUG:
                    raise ValueError
            # write out the last of the last index file, if any
            # (bytes sentinels, so they compare cleanly with the index fields)
            dkey = ddate = durl = b""
        else:
            if DEBUG>1:
                sys.stderr.write("dl3: %s"%dl.decode('ascii'))
            DCNT += 1
            dkey, ddate, durl, dtime = dl.split(b'\t')
            messyD = ISESSION.search(durl)