annotate bin/merge_date.py @ 113:4a52585a1aac

refactor datestream reading, fix pattern ordering in SESSION
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 26 Sep 2023 09:03:47 +0100
parents 52c6a9b0fc8c
children 0b1e6e134aca
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 that year's index
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 Usage: merge_date.py ksvstream cdx-dir outdir
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
7 ksvstream consists of tab-separated key, CC date, url and Unix timestamp
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 ''' # '
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
10 import sys, io, os, os.path, time, re
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 from isal import igzip
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
13
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
# Debugging verbosity: every leading '-d' on the command line raises it
# by one and is removed from sys.argv, so the positional arguments keep
# their documented positions.
DEBUG = 0
while len(sys.argv) > 1 and sys.argv[1] == '-d':
    sys.argv.pop(1)
    DEBUG += 1

# Three positional arguments are needed from here on; say so plainly
# rather than failing with an IndexError further down.
if len(sys.argv) < 4:
    sys.stderr.write("Usage: merge_date.py [-d]... ksvstream cdx-dir outdir\n")
    sys.exit(2)

# Filename templates for the numbered index files: gzipped input under
# cdx-dir, uncompressed output under outdir.  The doubled %% leaves a
# %0.3d slot behind, filled in later with the file number.
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
21
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
# All patterns are written as raw bytes literals, so that backslashes
# reach the regex engine untouched instead of being read by Python as
# (invalid) string escapes.

# Matches a "filename" property whose path has a crawldiagnostics or
# robotstxt component after the segment number; group 1 says which.
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Matches a session-id parameter that comes immediately after the '?':
#  group 1 is everything up to and including the '?'
#  group 2 is the whole name=value session parameter (in the cfid case
#          this takes in the cftoken parameter that follows it as well)
#  group 3 is the parameter name alone
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# The same pattern, ignoring case
ISESSION = re.compile(SESSION.pattern,flags=re.I)
# Captures the value of a leading "url" property
URL=re.compile(rb'\{"url": "([^"]*)"')
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
29
107
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
30 # Above based on this from broken Java code:
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
31 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
32 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
33 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
34 #(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
35 #(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
36 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37
90
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
38 #print(sys.argv[3],NPATH,file=sys.stderr)
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
39
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# The output directory has to exist before the first output file is opened
os.makedirs(sys.argv[3], exist_ok=True)

# FN is the number of the index file pair currently open;
# XCNT and DCNT are counters that show up in the diagnostic output
FN, XCNT, DCNT = 0, 0, 0

# Open pair number 0: XF is the gzipped index being read, NF the new
# index being written and NN the latter's name
XF = igzip.IGzipFile(filename=XPATH % 0)
NN = NPATH % 0
NF = open(NN, 'wb')
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
49
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
def nextLine(xq, messyD):
    '''Return the next index line to process, together with the queue.

    xq is the list of index lines being held back for reordering.  If
    it is not empty and messyD is false, its first entry is removed and
    returned.  Otherwise a line is read from the current index file XF.
    When that file has run out, it and the current output file NF are
    closed, the name of the finished output file is printed on stdout
    and the next numbered pair is opened.  An index file with nothing
    in it is passed over in the same way, so the caller never sees an
    empty line.

    Returns (line, xq), or (None, None) once there is no next index
    file.'''
    global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
    if xq and not messyD:
        return xq.pop(0), xq
    while True:
        xl = XF.readline()
        XCNT += 1
        if xl != b'':
            return xl, xq
        # need to move to next index file
        FN += 1
        XF.close()
        NF.close()
        print(NN, flush=True) # so we can compress it
        time.sleep(0.1) # so it flushes?
        XN = XPATH%FN
        if not os.path.exists(XN):
            return (None, None)
        XF = igzip.IGzipFile(filename=XN)
        NF = open((NN:=NPATH%FN), 'wb')
        # Restart the line count and go round again to read from the
        # new file: if it turns out to be empty we move straight on to
        # the one after it.
        XCNT = 0
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
73
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
def keys(key):
    '''Work around the 2019-35-vintage Java fixup failing to detect
    session ids that come first in the parameter part of a key.

    Returns a triple (messy, key1, key2):
     key has no such session id: (False, key, None)
     the session id runs to the end of key:
       (True, key up to but not including the '?', None)
     otherwise:
       (True, key up to and including the '?',
        key with the session parameter cut out)'''
    found = SESSION.match(key)
    if found is None:
        return False, key, None
    upToQuery = found.group(1)
    start, end = found.span(2)
    stripped = key[:start] + key[end:]
    if stripped != upToQuery:
        return True, upToQuery, stripped
    # Nothing followed the session parameter, so drop the '?' as well
    return True, upToQuery[:-1], None
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
87
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
DFQ = [] # for reordering if needed
messyD = False

def nextDate(df,dn):
    '''Read the next entry from the datestream.

    df is the datestream, open for reading bytes; dn only labels the
    call site in the debugging output.

    Returns (key, CC date, url, timestamp, messy).  The first four are
    the tab-separated fields of the line as bytes, the last of them
    still carrying its line ending.  messy is the result of searching
    the url with ISESSION, so a match object or None.  At end of input
    the result is ("", "", "", 0, False) -- note str, not bytes.

    Raises ValueError if a line does not have exactly four fields, or
    if, when debugging, DFQ is not empty at end of input.'''
    global DCNT
    dl = df.readline()
    if dl == b'':
        if DFQ and DEBUG:
            raise ValueError("EOF but non-empty DFQ: %s"%DFQ)
        # write out the last of the last index file, if any
        return "", "", "", 0, False
    if DEBUG>1:
        sys.stderr.write("dl%s: %s\n"%(dn,dl))
    fields = dl.split(b'\t')
    if len(fields) != 4:
        # Say where and what, which a bare unpacking error would not
        raise ValueError("Datestream entry %d has %d tab-separated fields,"
                         " expected 4: %r"%(DCNT+1, len(fields), dl))
    dkey, ddate, durl, dtime = fields
    messy = ISESSION.search(durl)
    DCNT += 1
    return dkey, ddate, durl, dtime, messy
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
106
90
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
107 with open(sys.argv[1], 'rb') as df:
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
108 DCNT = 0
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
109 dkey, ddate, durl, dtime, messyD = nextDate(df,1)
90
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
110
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
111 xq = []
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
112
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
113 while (nlRes := nextLine(xq, messyD))[0] is not None:
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
114 (xl, xq) = nlRes
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
115 xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
116 m = URL.match(xprops)
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
117 if m:
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
118 xurl = m[1]
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
119 else:
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
120 raise ValueError("No url in %s"%xprops)
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
121 if DEBUG>1:
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
122 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
123 for xp in (xkey, xdate, xurl))))
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
124 messyU, xkey1, xkey2 = keys(xkey)
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
125 if messyD:
109
52c6a9b0fc8c loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents: 107
diff changeset
126 noMatch = (not dkey.startswith(xkey1) or
52c6a9b0fc8c loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents: 107
diff changeset
127 (xkey2 is not None and dkey!=xkey2))
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
128 if messyU:
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
129 # better match
101
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
130 if noMatch:
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
131 print("Fail1: md: %s mu: %s\n"
107
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
132 " xkey: %s\n"
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
133 " dkey: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
134 " xdate: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
135 " ddate: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
136 " xurl: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
137 " durl: %s\n"
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
138 "DFQ: %s\n"
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
139 "k1, k2: |%s|%s|\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
140 "FN: %s XCNT: %s DCNT: %s\n"
107
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
141 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
142 (b'\n '.join(DFQ)).decode('ascii'),
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
143 xkey1, xkey2, FN, XCNT, DCNT, xl),
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
144 file=sys.stderr)
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
145 # fall through to the ordinary (non-messy) match case
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
146 else:
101
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
147 # still looking, save if >= date else fall through to write
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
148 if DEBUG>1:
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
149 print("Diso: match: %s\n"
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
150 " xkey: %s\n"
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
151 " dkey: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
152 " xdate: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
153 " ddate: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
154 " xurl: %s\n"
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
155 " durl: %s\n"
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
156 "xl: %s"%(not noMatch,
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
157 xkey, dkey, xdate, ddate, xurl, durl, xl),
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
158 file=sys.stderr)
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
159 if (dkey.startswith(xkey1) and
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
160 (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
101
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
161 xq.append(xl)
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
162 if DEBUG>1:
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
163 sys.stderr.write('xpush\n')
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
164 continue
e2e64c3d763e bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 100
diff changeset
165 # else fall through
91
460f0599e8cd mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
166 if (ddate != xdate or
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
167 not dkey.startswith(xkey1) or
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
168 (xkey2 is not None and dkey!=xkey2) or
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
169 durl!=xurl):
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
170 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
171
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
172 print("Fail2: md: %s mu: %s\n"
107
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
173 " xkey: %s\n"
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
174 " dkey: %s\n"
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
175 " xdate: %s\n"
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
176 " ddate: %s\n"
109
52c6a9b0fc8c loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents: 107
diff changeset
177 " xurl: %s\n"
52c6a9b0fc8c loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents: 107
diff changeset
178 " durl: %s\n"
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
179 "DFQ: %s\n"
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
180 "k1, k2: |%s|%s|\n"
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
181 "FN: %s XCNT: %s DCNT: %s\n"
107
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 102
diff changeset
182 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
109
52c6a9b0fc8c loosen must-match criterion in the both-messy case
Henry Thompson <ht@markup.co.uk>
parents: 107
diff changeset
183 xurl, durl,
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
184 (b'\n '.join(DFQ)).decode('ascii'),
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
185 xkey1, xkey2, FN, XCNT, DCNT, xl),
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
186 file=sys.stderr)
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
187 # try to force recovery
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
188 dkey, ddate, durl, dtime, messyD = nextDate(df,3)
90
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
189 NF.write(xl)
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
190 if DEBUG>1:
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
191 sys.stderr.write("out_nl\n")
90
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
192 continue
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
193 # Got it
90
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
194 NF.write(xkey)
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
195 NF.write(b' ')
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
196 NF.write(xdate)
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
197 NF.write(b' ')
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
198 NF.write(xprops[:-2])
c1a70532444c flip loops
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
199 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
200 if DEBUG>1:
102
e606c609f813 reinstate better check to start queuing,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 101
diff changeset
201 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
100
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
202 for xp in (xkey, xdate, xurl))))
18446a7eeb9e rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 94
diff changeset
203 sys.stderr.write(" %d\n"%int(dtime[:-3]))
113
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
204
4a52585a1aac refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 109
diff changeset
205 dkey, ddate, durl, dtime, messyD = nextDate(df,2)