comparison bin/merge_date.py @ 115:0b1e6e134aca

robotstxt and crawldiagnostics get a free ride; get rid of DFQ and xq, with a big simplification and refactor as a result; fix bug in date stream EOF handling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 26 Sep 2023 17:42:57 +0100
parents 4a52585a1aac
children f52783faf3ee
comparison
equal deleted inserted replaced
114:5818d79c4ec9 115:0b1e6e134aca
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' 24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
25 b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' 25 b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
26 b'=[^&]*)') 26 b'=[^&]*)')
27 ISESSION = re.compile(SESSION.pattern,flags=re.I) 27 ISESSION = re.compile(SESSION.pattern,flags=re.I)
28 URL=re.compile(b'\{"url": "([^"]*)"') 28 URL=re.compile(b'\{"url": "([^"]*)"')
29 WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
29 30
30 # Above based on this from broken Java code: 31 # Above based on this from broken Java code:
31 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 32 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
32 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), 33 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
33 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), 34 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
45 DCNT = 0 46 DCNT = 0
46 47
47 XF = igzip.IGzipFile(filename=XPATH%0) 48 XF = igzip.IGzipFile(filename=XPATH%0)
48 NF = open(NN:=(NPATH%0),'wb') 49 NF = open(NN:=(NPATH%0),'wb')
49 50
50 def nextLine(xq, messyD): 51 def nextLine():
51 '''Move on to next index file if current has run out''' 52 '''Move on to next index file if current has run out'''
52 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT 53 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
53 if xq and not messyD:
54 return xq.pop(0), xq
55 while True: 54 while True:
56 xl=XF.readline() 55 xl=XF.readline()
57 XCNT += 1 56 XCNT += 1
58 if xl == b'': 57 if xl == b'':
59 # need to move to next index file 58 # need to move to next index file
62 NF.close() 61 NF.close()
63 print(NN, flush=True) # so we can compress it 62 print(NN, flush=True) # so we can compress it
64 time.sleep(0.1) # so it flushes? 63 time.sleep(0.1) # so it flushes?
65 XN=XPATH%FN 64 XN=XPATH%FN
66 if not os.path.exists(XN): 65 if not os.path.exists(XN):
67 return (None, None) 66 return None
68 XF = igzip.IGzipFile(filename=XN) 67 XF = igzip.IGzipFile(filename=XN)
69 NF = open((NN:=NPATH%FN), 'wb') 68 NF = open((NN:=NPATH%FN), 'wb')
70 xl = XF.readline() 69 xl = XF.readline()
71 XCNT = 1 70 XCNT = 1
72 return xl, xq 71 if WARC.search(xl):
72 return xl
73 else:
74 NF.write(xl)
75 if DEBUG:
76 sys.stderr.write("out_rc\n")
73 77
74 def keys(key):
75 '''Deal with failure of 2019-35-vintage Java fixup to detect
76 parameter-part-initial session ids'''
77 if m:=SESSION.match(key):
78 prefix=m[1]
79 e, b = m.span(2)
80 fixed=key[:e]+key[b:]
81 if fixed==m[1]:
82 return True, prefix[:-1], None
83 else:
84 return True, prefix, fixed
85 else:
86 return False, key, None
87
88 DFQ = [] # for reordering if needed
89 messyD = False
90 78
91 def nextDate(df,dn): 79 def nextDate(df,dn):
92 global DEBUG, DFQ, DCNT, ISESSION 80 global DEBUG, DCNT, XCNT
93 dl = df.readline() 81 dl = df.readline()
94 if dl == b'': 82 if dl == b'':
95 if DFQ:
96 if DEBUG:
97 raise ValueError("EOF but non-empty DFQ: %s"%DFQ)
98 # write out the last of the last index file, if any 83 # write out the last of the last index file, if any
99 return "", "", "", 0, False 84 return "", "", "", 0
100 if DEBUG>1: 85 if DEBUG:
101 sys.stderr.write("dl%s: %s\n"%(dn,dl)) 86 sys.stderr.write("dl%s: %s\n"%(dn,dl))
102 dkey, ddate, durl, dtime = dl.split(b'\t') 87 dkey, ddate, durl, dtime = dl.split(b'\t')
103 messyD = ISESSION.search(durl)
104 DCNT += 1 88 DCNT += 1
105 return dkey, ddate, durl, dtime, messyD 89 return dkey, ddate, durl, dtime
106 90
107 with open(sys.argv[1], 'rb') as df: 91 with open(sys.argv[1], 'rb') as df:
108 DCNT = 0 92 DCNT = 0
109 dkey, ddate, durl, dtime, messyD = nextDate(df,1)
110 93
111 xq = [] 94 dkey, ddate, durl, dtime = nextDate(df,1)
112 95
113 while (nlRes := nextLine(xq, messyD))[0] is not None: 96 while (xl := nextLine())[0] is not None:
114 (xl, xq) = nlRes
115 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) 97 xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
116 m = URL.match(xprops) 98 m = URL.match(xprops)
117 if m: 99 if m:
118 xurl = m[1] 100 xurl = m[1]
119 else: 101 else:
120 raise ValueError("No url in %s"%xprops) 102 raise ValueError("No url in %s"%xprops)
121 if DEBUG>1: 103 if DEBUG:
122 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') 104 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
123 for xp in (xkey, xdate, xurl)))) 105 for xp in (xkey, xdate, xurl))))
124 messyU, xkey1, xkey2 = keys(xkey) 106 if dkey==xkey and ddate==xdate and durl==xurl:
125 if messyD: 107 # Got it
126 noMatch = (not dkey.startswith(xkey1) or 108 NF.write(xkey)
127 (xkey2 is not None and dkey!=xkey2)) 109 NF.write(b' ')
128 if messyU: 110 NF.write(xdate)
129 # better match 111 NF.write(b' ')
130 if noMatch: 112 NF.write(xprops[:-2])
131 print("Fail1: md: %s mu: %s\n" 113 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
132 " xkey: %s\n" 114 if DEBUG:
133 " dkey: %s\n" 115 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
134 " xdate: %s\n" 116 for xp in (xkey, xdate, xurl))))
135 " ddate: %s\n" 117 sys.stderr.write(" %d\n"%int(dtime[:-3]))
136 " xurl: %s\n" 118
137 " durl: %s\n" 119 dkey, ddate, durl, dtime = nextDate(df,2)
138 "DFQ: %s\n" 120 continue
139 "k1, k2: |%s|%s|\n" 121 else:
140 "FN: %s XCNT: %s DCNT: %s\n" 122 if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
141 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, 123 # we've missed something, disaster looms
142 (b'\n '.join(DFQ)).decode('ascii'), 124 print("Fail2:"
143 xkey1, xkey2, FN, XCNT, DCNT, xl),
144 file=sys.stderr)
145 # fall through to the ordinary (non-messy) match case
146 else:
147 # still looking, save if >= date else fall through to write
148 if DEBUG>1:
149 print("Diso: match: %s\n"
150 " xkey: %s\n"
151 " dkey: %s\n"
152 " xdate: %s\n"
153 " ddate: %s\n"
154 " xurl: %s\n"
155 " durl: %s\n"
156 "xl: %s"%(not noMatch,
157 xkey, dkey, xdate, ddate, xurl, durl, xl),
158 file=sys.stderr)
159 if (dkey.startswith(xkey1) and
160 (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
161 xq.append(xl)
162 if DEBUG>1:
163 sys.stderr.write('xpush\n')
164 continue
165 # else fall through
166 if (ddate != xdate or
167 not dkey.startswith(xkey1) or
168 (xkey2 is not None and dkey!=xkey2) or
169 durl!=xurl):
170 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
171
172 print("Fail2: md: %s mu: %s\n"
173 " xkey: %s\n" 125 " xkey: %s\n"
174 " dkey: %s\n" 126 " dkey: %s\n"
175 " xdate: %s\n" 127 " xdate: %s\n"
176 " ddate: %s\n" 128 " ddate: %s\n"
177 " xurl: %s\n" 129 " xurl: %s\n"
178 " durl: %s\n" 130 " durl: %s\n"
179 "DFQ: %s\n"
180 "k1, k2: |%s|%s|\n"
181 "FN: %s XCNT: %s DCNT: %s\n" 131 "FN: %s XCNT: %s DCNT: %s\n"
182 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, 132 "xl: %s"%(xkey, dkey, xdate, ddate,
183 xurl, durl, 133 xurl, durl,
184 (b'\n '.join(DFQ)).decode('ascii'), 134 FN, XCNT, DCNT, xl),
185 xkey1, xkey2, FN, XCNT, DCNT, xl),
186 file=sys.stderr) 135 file=sys.stderr)
187 # try to force recovery 136 # try to force recovery
188 dkey, ddate, durl, dtime, messyD = nextDate(df,3) 137 dkey, ddate, durl, dtime = nextDate(df,3)
189 NF.write(xl) 138 continue
190 if DEBUG>1: 139 # else fall through to write
191 sys.stderr.write("out_nl\n") 140 NF.write(xl)
192 continue 141 if DEBUG:
193 # Got it 142 sys.stderr.write("out_nl\n")
194 NF.write(xkey)
195 NF.write(b' ')
196 NF.write(xdate)
197 NF.write(b' ')
198 NF.write(xprops[:-2])
199 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
200 if DEBUG>1:
201 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
202 for xp in (xkey, xdate, xurl))))
203 sys.stderr.write(" %d\n"%int(dtime[:-3]))
204
205 dkey, ddate, durl, dtime, messyD = nextDate(df,2)