Mercurial > hg > cc > cirrus_work
comparison bin/merge_date.py @ 115:0b1e6e134aca
robotstxt and crawldiagnostics records get a free ride (passed through unchanged),
get rid of DFQ and xq,
big simplification and refactor as a result,
fix bug in date-stream EOF handling
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 26 Sep 2023 17:42:57 +0100 |
parents | 4a52585a1aac |
children | f52783faf3ee |
comparison
equal
deleted
inserted
replaced
114:5818d79c4ec9 | 115:0b1e6e134aca |
---|---|
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' | 24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' |
25 b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' | 25 b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' |
26 b'=[^&]*)') | 26 b'=[^&]*)') |
27 ISESSION = re.compile(SESSION.pattern,flags=re.I) | 27 ISESSION = re.compile(SESSION.pattern,flags=re.I) |
28 URL=re.compile(b'\{"url": "([^"]*)"') | 28 URL=re.compile(b'\{"url": "([^"]*)"') |
29 WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/') | |
29 | 30 |
30 # Above based on this from broken Java code: | 31 # Above based on this from broken Java code: |
31 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 | 32 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 |
32 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), | 33 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), |
33 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), | 34 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), |
45 DCNT = 0 | 46 DCNT = 0 |
46 | 47 |
47 XF = igzip.IGzipFile(filename=XPATH%0) | 48 XF = igzip.IGzipFile(filename=XPATH%0) |
48 NF = open(NN:=(NPATH%0),'wb') | 49 NF = open(NN:=(NPATH%0),'wb') |
49 | 50 |
50 def nextLine(xq, messyD): | 51 def nextLine(): |
51 '''Move on to next index file if current has run out''' | 52 '''Move on to next index file if current has run out''' |
52 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT | 53 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT |
53 if xq and not messyD: | |
54 return xq.pop(0), xq | |
55 while True: | 54 while True: |
56 xl=XF.readline() | 55 xl=XF.readline() |
57 XCNT += 1 | 56 XCNT += 1 |
58 if xl == b'': | 57 if xl == b'': |
59 # need to move to next index file | 58 # need to move to next index file |
62 NF.close() | 61 NF.close() |
63 print(NN, flush=True) # so we can compress it | 62 print(NN, flush=True) # so we can compress it |
64 time.sleep(0.1) # so it flushes? | 63 time.sleep(0.1) # so it flushes? |
65 XN=XPATH%FN | 64 XN=XPATH%FN |
66 if not os.path.exists(XN): | 65 if not os.path.exists(XN): |
67 return (None, None) | 66 return None |
68 XF = igzip.IGzipFile(filename=XN) | 67 XF = igzip.IGzipFile(filename=XN) |
69 NF = open((NN:=NPATH%FN), 'wb') | 68 NF = open((NN:=NPATH%FN), 'wb') |
70 xl = XF.readline() | 69 xl = XF.readline() |
71 XCNT = 1 | 70 XCNT = 1 |
72 return xl, xq | 71 if WARC.search(xl): |
72 return xl | |
73 else: | |
74 NF.write(xl) | |
75 if DEBUG: | |
76 sys.stderr.write("out_rc\n") | |
73 | 77 |
74 def keys(key): | |
75 '''Deal with failure of 2019-35-vintage Java fixup to detect | |
76 parameter-part-initial session ids''' | |
77 if m:=SESSION.match(key): | |
78 prefix=m[1] | |
79 e, b = m.span(2) | |
80 fixed=key[:e]+key[b:] | |
81 if fixed==m[1]: | |
82 return True, prefix[:-1], None | |
83 else: | |
84 return True, prefix, fixed | |
85 else: | |
86 return False, key, None | |
87 | |
88 DFQ = [] # for reordering if needed | |
89 messyD = False | |
90 | 78 |
91 def nextDate(df,dn): | 79 def nextDate(df,dn): |
92 global DEBUG, DFQ, DCNT, ISESSION | 80 global DEBUG, DCNT, XCNT |
93 dl = df.readline() | 81 dl = df.readline() |
94 if dl == b'': | 82 if dl == b'': |
95 if DFQ: | |
96 if DEBUG: | |
97 raise ValueError("EOF but non-empty DFQ: %s"%DFQ) | |
98 # write out the last of the last index file, if any | 83 # write out the last of the last index file, if any |
99 return "", "", "", 0, False | 84 return "", "", "", 0 |
100 if DEBUG>1: | 85 if DEBUG: |
101 sys.stderr.write("dl%s: %s\n"%(dn,dl)) | 86 sys.stderr.write("dl%s: %s\n"%(dn,dl)) |
102 dkey, ddate, durl, dtime = dl.split(b'\t') | 87 dkey, ddate, durl, dtime = dl.split(b'\t') |
103 messyD = ISESSION.search(durl) | |
104 DCNT += 1 | 88 DCNT += 1 |
105 return dkey, ddate, durl, dtime, messyD | 89 return dkey, ddate, durl, dtime |
106 | 90 |
107 with open(sys.argv[1], 'rb') as df: | 91 with open(sys.argv[1], 'rb') as df: |
108 DCNT = 0 | 92 DCNT = 0 |
109 dkey, ddate, durl, dtime, messyD = nextDate(df,1) | |
110 | 93 |
111 xq = [] | 94 dkey, ddate, durl, dtime = nextDate(df,1) |
112 | 95 |
113 while (nlRes := nextLine(xq, messyD))[0] is not None: | 96 while (xl := nextLine())[0] is not None: |
114 (xl, xq) = nlRes | |
115 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) | 97 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) |
116 m = URL.match(xprops) | 98 m = URL.match(xprops) |
117 if m: | 99 if m: |
118 xurl = m[1] | 100 xurl = m[1] |
119 else: | 101 else: |
120 raise ValueError("No url in %s"%xprops) | 102 raise ValueError("No url in %s"%xprops) |
121 if DEBUG>1: | 103 if DEBUG: |
122 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') | 104 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') |
123 for xp in (xkey, xdate, xurl)))) | 105 for xp in (xkey, xdate, xurl)))) |
124 messyU, xkey1, xkey2 = keys(xkey) | 106 if dkey==xkey and ddate==xdate and durl==xurl: |
125 if messyD: | 107 # Got it |
126 noMatch = (not dkey.startswith(xkey1) or | 108 NF.write(xkey) |
127 (xkey2 is not None and dkey!=xkey2)) | 109 NF.write(b' ') |
128 if messyU: | 110 NF.write(xdate) |
129 # better match | 111 NF.write(b' ') |
130 if noMatch: | 112 NF.write(xprops[:-2]) |
131 print("Fail1: md: %s mu: %s\n" | 113 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) |
132 " xkey: %s\n" | 114 if DEBUG: |
133 " dkey: %s\n" | 115 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') |
134 " xdate: %s\n" | 116 for xp in (xkey, xdate, xurl)))) |
135 " ddate: %s\n" | 117 sys.stderr.write(" %d\n"%int(dtime[:-3])) |
136 " xurl: %s\n" | 118 |
137 " durl: %s\n" | 119 dkey, ddate, durl, dtime = nextDate(df,2) |
138 "DFQ: %s\n" | 120 continue |
139 "k1, k2: |%s|%s|\n" | 121 else: |
140 "FN: %s XCNT: %s DCNT: %s\n" | 122 if dkey and xkey.decode('ascii')>(dkey.decode('ascii')): |
141 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, | 123 # we've missed something, disaster looms |
142 (b'\n '.join(DFQ)).decode('ascii'), | 124 print("Fail2:" |
143 xkey1, xkey2, FN, XCNT, DCNT, xl), | |
144 file=sys.stderr) | |
145 # fall through to the ordinary (non-messy) match case | |
146 else: | |
147 # still looking, save if >= date else fall through to write | |
148 if DEBUG>1: | |
149 print("Diso: match: %s\n" | |
150 " xkey: %s\n" | |
151 " dkey: %s\n" | |
152 " xdate: %s\n" | |
153 " ddate: %s\n" | |
154 " xurl: %s\n" | |
155 " durl: %s\n" | |
156 "xl: %s"%(not noMatch, | |
157 xkey, dkey, xdate, ddate, xurl, durl, xl), | |
158 file=sys.stderr) | |
159 if (dkey.startswith(xkey1) and | |
160 (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))): | |
161 xq.append(xl) | |
162 if DEBUG>1: | |
163 sys.stderr.write('xpush\n') | |
164 continue | |
165 # else fall through | |
166 if (ddate != xdate or | |
167 not dkey.startswith(xkey1) or | |
168 (xkey2 is not None and dkey!=xkey2) or | |
169 durl!=xurl): | |
170 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): | |
171 | |
172 print("Fail2: md: %s mu: %s\n" | |
173 " xkey: %s\n" | 125 " xkey: %s\n" |
174 " dkey: %s\n" | 126 " dkey: %s\n" |
175 " xdate: %s\n" | 127 " xdate: %s\n" |
176 " ddate: %s\n" | 128 " ddate: %s\n" |
177 " xurl: %s\n" | 129 " xurl: %s\n" |
178 " durl: %s\n" | 130 " durl: %s\n" |
179 "DFQ: %s\n" | |
180 "k1, k2: |%s|%s|\n" | |
181 "FN: %s XCNT: %s DCNT: %s\n" | 131 "FN: %s XCNT: %s DCNT: %s\n" |
182 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, | 132 "xl: %s"%(xkey, dkey, xdate, ddate, |
183 xurl, durl, | 133 xurl, durl, |
184 (b'\n '.join(DFQ)).decode('ascii'), | 134 FN, XCNT, DCNT, xl), |
185 xkey1, xkey2, FN, XCNT, DCNT, xl), | |
186 file=sys.stderr) | 135 file=sys.stderr) |
187 # try to force recovery | 136 # try to force recovery |
188 dkey, ddate, durl, dtime, messyD = nextDate(df,3) | 137 dkey, ddate, durl, dtime = nextDate(df,3) |
189 NF.write(xl) | 138 continue |
190 if DEBUG>1: | 139 # else fall through to write |
191 sys.stderr.write("out_nl\n") | 140 NF.write(xl) |
192 continue | 141 if DEBUG: |
193 # Got it | 142 sys.stderr.write("out_nl\n") |
194 NF.write(xkey) | |
195 NF.write(b' ') | |
196 NF.write(xdate) | |
197 NF.write(b' ') | |
198 NF.write(xprops[:-2]) | |
199 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) | |
200 if DEBUG>1: | |
201 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') | |
202 for xp in (xkey, xdate, xurl)))) | |
203 sys.stderr.write(" %d\n"%int(dtime[:-3])) | |
204 | |
205 dkey, ddate, durl, dtime, messyD = nextDate(df,2) |