annotate bin/merge_date.py @ 109:52c6a9b0fc8c

loosen must-match criterion in the both-messy case
author Henry Thompson <ht@markup.co.uk>
date Tue, 19 Sep 2023 19:29:41 +0100
parents 40c460fed99f
children 4a52585a1aac
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 that year's index
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 Usage: merge_date.py ksvstream cdx-dir outdir
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 ksvstream consists of tab-separated key, CC date, URL and Unix timestamp
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 ''' # '
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
10 import sys, io, os, os.path, time, re
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 from isal import igzip
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
13
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
# Debug level: every leading '-d' on the command line raises it by one
# and is removed from sys.argv, so the positional arguments keep their
# usual indices below.
DEBUG = 0
while sys.argv[1] == '-d':
    sys.argv.pop(1)
    DEBUG += 1

# Filename templates, still carrying a %0.3d slot for the index file number:
# XPATH names the gzipped input index files under the cdx directory (argv[2]),
# NPATH names the uncompressed output files under the output directory (argv[3]).
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

# All patterns are raw bytes literals so that regex escapes such as \? and \{
# are not (invalid) Python string escapes; the compiled patterns are unchanged.

# Matches a "filename" property pointing into a crawldiagnostics or
# robotstxt segment directory.
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Group 1: everything up to and including the first '?'.
# Group 2: a session-id parameter (name=value) immediately following it.
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# Same pattern, case-insensitive
ISESSION = re.compile(SESSION.pattern,flags=re.I)
# Captures the value of the leading "url" property of an index line's JSON part
URL=re.compile(rb'\{"url": "([^"]*)"')
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
29
107
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
30 # Above based on this from broken Java code:
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
31 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
32 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
33 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
34 #(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
35 #(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
36 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37
90
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
38 #print(sys.argv[3],NPATH,file=sys.stderr)
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
39
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# The output directory must exist before the first output file is opened
os.makedirs(sys.argv[3], exist_ok=True)

# FN: number of the index file currently open
# XCNT: lines read so far from the current index file
# DCNT: lines read so far from the ks stream
FN = XCNT = DCNT = 0

# Start with index file 0: XF is the gzipped input, NF the plain output
# being written under the name NN
XF = igzip.IGzipFile(filename=XPATH%FN)
NN = NPATH%FN
NF = open(NN,'wb')
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
49
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
def nextLine(xq, messyD):
    '''Return the next index line to process, paired with the queue.

    xq is the list of index lines that were held back earlier; messyD is
    the caller's flag for the current ks line.  When xq is non-empty and
    messyD is falsy, the oldest queued line is popped and returned.
    Otherwise a fresh line is read from the current index file XF.

    When XF has run out, both it and the current output file NF are
    closed, the finished output name is printed on stdout, and the next
    numbered index/output pair is opened.  Index files that turn out to
    be empty are skipped rather than handing b'' back to the caller.

    Returns (line, xq), or (None, None) once there is no further index
    file to open.'''
    global FN, NF, NN, XF, XCNT
    if xq and not messyD:
        return xq.pop(0), xq
    while True:
        xl = XF.readline()
        XCNT += 1
        if xl != b'':
            return xl, xq
        # Current index file is exhausted: need to move to the next one
        FN += 1
        XF.close()
        NF.close()
        print(NN, flush=True) # so we can compress it
        time.sleep(0.1) # so it flushes?
        XN = XPATH%FN
        if not os.path.exists(XN):
            return (None, None)
        XF = igzip.IGzipFile(filename=XN)
        NF = open((NN:=NPATH%FN), 'wb')
        # Restart the per-file line count; the read at the top of the
        # loop brings it to 1 for the first line of the new file
        XCNT = 0
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
73
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
def keys(key):
    '''Split an index key around a session-id parameter that directly
    follows the first '?', which the 2019-35-vintage Java fixup failed
    to detect.

    Returns a triple (messy, key1, key2):
      no such parameter:           (False, key, None)
      it is all that follows '?':  (True, part before the '?', None)
      otherwise:                   (True, part up to and including '?',
                                    key with the parameter cut out)'''
    found = SESSION.match(key)
    if found is None:
        return False, key, None
    head = found[1]
    start, stop = found.span(2)
    stripped = key[:start]+key[stop:]
    if stripped==head:
        # Nothing left after the '?', so drop the '?' as well
        return True, head[:-1], None
    return True, head, stripped
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
87
92
e56a7aad9ce9 attempt at reordering if necessary
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 91
diff changeset
dfq = [] # for reordering if needed
messyD = False

# Walk the index files and the ks stream in parallel.  Index lines with no
# matching ks line are copied to the output unchanged; a matching index line
# is written with a "lastmod" property taken from the ks line's timestamp.
with open(sys.argv[1], 'rb') as df:
    dl = df.readline()
    DCNT = 1
    if DEBUG>1:
        sys.stderr.write("dl1: %s"%dl.decode('ascii'))
    # Each ks line holds four tab-separated fields
    dkey, ddate, durl, dtime = dl.split(b'\t')
    # Match object (truthy) if the ks URL carries a session-id parameter
    messyD = ISESSION.search(durl)

    xq = [] # index lines held back while a messy ks line is unresolved

    while (nlRes := nextLine(xq, messyD))[0] is not None:
        (xl, xq) = nlRes
        # Index line: key, date, then the JSON properties
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        m = URL.match(xprops)
        if m:
            xurl = m[1]
        else:
            raise ValueError("No url in %s"%xprops)
        if DEBUG>1:
            sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                                   for xp in (xkey, xdate, xurl))))
        messyU, xkey1, xkey2 = keys(xkey)
        if messyD:
            noMatch = (not dkey.startswith(xkey1) or
                       (xkey2 is not None and dkey!=xkey2))
            if messyU:
                # better match
                if noMatch:
                    raise ValueError("Fail1: md: %s mu: %s\n"
                                     " xkey: %s\n"
                                     " dkey: %s\n"
                                     " xdate: %s\n"
                                     " ddate: %s\n"
                                     " xurl: %s\n"
                                     " durl: %s\n"
                                     "dfq: %s\n"
                                     "k1, k2: |%s|%s|\n"
                                     "FN: %s XCNT: %s DCNT: %s\n"
                                     "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
                                               (b'\n '.join(dfq)).decode('ascii'),
                                               xkey1, xkey2, FN, XCNT, DCNT, xl))
                # fall through to the ordinary (non-messy) match case
            else:
                # still looking, save if >= date else fall through to write
                if DEBUG>1:
                    print("Diso: match: %s\n"
                          " xkey: %s\n"
                          " dkey: %s\n"
                          " xdate: %s\n"
                          " ddate: %s\n"
                          " xurl: %s\n"
                          " durl: %s\n"
                          "xl: %s"%(not noMatch,
                                    xkey, dkey, xdate, ddate, xurl, durl, xl),
                          file=sys.stderr)
                if (dkey.startswith(xkey1) and
                    (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
                    xq.append(xl)
                    if DEBUG>1:
                        sys.stderr.write('xpush\n')
                    continue
                # else fall through
        if (ddate != xdate or
            not dkey.startswith(xkey1) or
            (xkey2 is not None and dkey!=xkey2) or
            durl!=xurl):
            # Not the index line the current ks line belongs to
            if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
                # The index key is already past the ks key
                raise ValueError("Fail2: md: %s mu: %s\n"
                                 " xkey: %s\n"
                                 " dkey: %s\n"
                                 " xdate: %s\n"
                                 " ddate: %s\n"
                                 " xurl: %s\n"
                                 " durl: %s\n"
                                 "dfq: %s\n"
                                 "k1, k2: |%s|%s|\n"
                                 "FN: %s XCNT: %s DCNT: %s\n"
                                 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
                                           xurl, durl,
                                           (b'\n '.join(dfq)).decode('ascii'),
                                           xkey1, xkey2, FN, XCNT, DCNT, xl))
            NF.write(xl)
            if DEBUG>1:
                sys.stderr.write("out_nl\n")
            continue
        # Got it
        NF.write(xkey)
        NF.write(b' ')
        NF.write(xdate)
        NF.write(b' ')
        # Drop the last two bytes of the properties (presumably the closing
        # brace and newline -- TODO confirm) and re-close after "lastmod"
        NF.write(xprops[:-2])
        # dtime minus its last three bytes must parse as an integer
        # (presumably a trailing ".0" plus newline -- TODO confirm)
        NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
        if DEBUG>1:
            sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
                                                   for xp in (xkey, xdate, xurl))))
            sys.stderr.write(" %d\n"%int(dtime[:-3]))
        dl = df.readline()
        # df is a binary file, so end of input is b'' (never the str '')
        if dl == b'':
            if dfq:
                if DEBUG:
                    raise ValueError
            # write out the last of the last index file, if any:
            # empty bytes fields can never equal a real index date, so every
            # remaining index line is copied through unchanged
            dkey = ddate = durl = b''
            # No ks line is pending any more; clearing the flag also lets
            # nextLine hand back any index lines still held in xq
            messyD = None
        else:
            if DEBUG>1:
                sys.stderr.write("dl3: %s"%dl.decode('ascii'))
            DCNT += 1
            dkey, ddate, durl, dtime = dl.split(b'\t')
            messyD = ISESSION.search(durl)