comparison bin/merge_date.py @ 113:4a52585a1aac

refactor datestream reading, fix pattern ordering in SESSION
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 26 Sep 2023 09:03:47 +0100
parents 52c6a9b0fc8c
children 0b1e6e134aca
comparison
equal deleted inserted replaced
112:827eadc72122 113:4a52585a1aac
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into 2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into
3 that year's index 3 that year's index
4 4
5 Usage: merge_date.py ksvstream cdx-dir outdir 5 Usage: merge_date.py ksvstream cdx-dir outdir
6 6
7 ksvstream consists of tab-separated key, CC date and Unix timestamp 7 ksvstream consists of tab-separated key, CC date, url and Unix timestamp
8 ''' # ' 8 ''' # '
9 9
10 import sys, io, os, os.path, time, re 10 import sys, io, os, os.path, time, re
11 from isal import igzip 11 from isal import igzip
12 12
20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] 20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
21 21
22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' 22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
23 b'(crawldiagnostics|robotstxt)/') 23 b'(crawldiagnostics|robotstxt)/')
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' 24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' 25 b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
26 b'=[^&]*)') 26 b'=[^&]*)')
27 ISESSION = re.compile(SESSION.pattern,flags=re.I) 27 ISESSION = re.compile(SESSION.pattern,flags=re.I)
28 URL=re.compile(b'\{"url": "([^"]*)"') 28 URL=re.compile(b'\{"url": "([^"]*)"')
29 29
30 # Above based on this from broken Java code: 30 # Above based on this from broken Java code:
83 else: 83 else:
84 return True, prefix, fixed 84 return True, prefix, fixed
85 else: 85 else:
86 return False, key, None 86 return False, key, None
87 87
88 dfq = [] # for reordering if needed 88 DFQ = [] # for reordering if needed
89 messyD = False 89 messyD = False
90 90
91 with open(sys.argv[1], 'rb') as df: 91 def nextDate(df,dn):
92 global DEBUG, DFQ, DCNT, ISESSION
92 dl = df.readline() 93 dl = df.readline()
93 DCNT = 1 94 if dl == b'':
95 if DFQ:
96 if DEBUG:
97 raise ValueError("EOF but non-empty DFQ: %s"%DFQ)
98 # write out the last of the last index file, if any
99 return "", "", "", 0, False
94 if DEBUG>1: 100 if DEBUG>1:
95 sys.stderr.write("dl1: %s"%dl.decode('ascii')) 101 sys.stderr.write("dl%s: %s\n"%(dn,dl))
96 dkey, ddate, durl, dtime = dl.split(b'\t') 102 dkey, ddate, durl, dtime = dl.split(b'\t')
97 messyD = ISESSION.search(durl) 103 messyD = ISESSION.search(durl)
104 DCNT += 1
105 return dkey, ddate, durl, dtime, messyD
106
107 with open(sys.argv[1], 'rb') as df:
108 DCNT = 0
109 dkey, ddate, durl, dtime, messyD = nextDate(df,1)
98 110
99 xq = [] 111 xq = []
100 112
101 while (nlRes := nextLine(xq, messyD))[0] is not None: 113 while (nlRes := nextLine(xq, messyD))[0] is not None:
102 (xl, xq) = nlRes 114 (xl, xq) = nlRes
114 noMatch = (not dkey.startswith(xkey1) or 126 noMatch = (not dkey.startswith(xkey1) or
115 (xkey2 is not None and dkey!=xkey2)) 127 (xkey2 is not None and dkey!=xkey2))
116 if messyU: 128 if messyU:
117 # better match 129 # better match
118 if noMatch: 130 if noMatch:
119 raise ValueError("Fail1: md: %s mu: %s\n" 131 print("Fail1: md: %s mu: %s\n"
120 " xkey: %s\n" 132 " xkey: %s\n"
121 " dkey: %s\n" 133 " dkey: %s\n"
122 " xdate: %s\n" 134 " xdate: %s\n"
123 " ddate: %s\n" 135 " ddate: %s\n"
124 " xurl: %s\n" 136 " xurl: %s\n"
125 " durl: %s\n" 137 " durl: %s\n"
126 "dfq: %s\n" 138 "DFQ: %s\n"
127 "k1, k2: |%s|%s|\n" 139 "k1, k2: |%s|%s|\n"
128 "FN: %s XCNT: %s DCNT: %s\n" 140 "FN: %s XCNT: %s DCNT: %s\n"
129 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, 141 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
130 (b'\n '.join(dfq)).decode('ascii'), 142 (b'\n '.join(DFQ)).decode('ascii'),
131 xkey1, xkey2, FN, XCNT, DCNT, xl)) 143 xkey1, xkey2, FN, XCNT, DCNT, xl),
144 file=sys.stderr)
132 # fall through to the ordinary (non-messy) match case 145 # fall through to the ordinary (non-messy) match case
133 else: 146 else:
134 # still looking, save if >= date else fall through to write 147 # still looking, save if >= date else fall through to write
135 if DEBUG>1: 148 if DEBUG>1:
136 print("Diso: match: %s\n" 149 print("Diso: match: %s\n"
154 not dkey.startswith(xkey1) or 167 not dkey.startswith(xkey1) or
155 (xkey2 is not None and dkey!=xkey2) or 168 (xkey2 is not None and dkey!=xkey2) or
156 durl!=xurl): 169 durl!=xurl):
157 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): 170 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
158 171
159 raise ValueError("Fail2: md: %s mu: %s\n" 172 print("Fail2: md: %s mu: %s\n"
160 " xkey: %s\n" 173 " xkey: %s\n"
161 " dkey: %s\n" 174 " dkey: %s\n"
162 " xdate: %s\n" 175 " xdate: %s\n"
163 " ddate: %s\n" 176 " ddate: %s\n"
164 " xurl: %s\n" 177 " xurl: %s\n"
165 " durl: %s\n" 178 " durl: %s\n"
166 "dfq: %s\n" 179 "DFQ: %s\n"
167 "k1, k2: |%s|%s|\n" 180 "k1, k2: |%s|%s|\n"
168 "FN: %s XCNT: %s DCNT: %s\n" 181 "FN: %s XCNT: %s DCNT: %s\n"
169 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, 182 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
170 xurl, durl, 183 xurl, durl,
171 (b'\n '.join(dfq)).decode('ascii'), 184 (b'\n '.join(DFQ)).decode('ascii'),
172 xkey1, xkey2, FN, XCNT, DCNT, xl)) 185 xkey1, xkey2, FN, XCNT, DCNT, xl),
186 file=sys.stderr)
187 # try to force recovery
188 dkey, ddate, durl, dtime, messyD = nextDate(df,3)
173 NF.write(xl) 189 NF.write(xl)
174 if DEBUG>1: 190 if DEBUG>1:
175 sys.stderr.write("out_nl\n") 191 sys.stderr.write("out_nl\n")
176 continue 192 continue
177 # Got it 193 # Got it
183 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) 199 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
184 if DEBUG>1: 200 if DEBUG>1:
185 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') 201 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
186 for xp in (xkey, xdate, xurl)))) 202 for xp in (xkey, xdate, xurl))))
187 sys.stderr.write(" %d\n"%int(dtime[:-3])) 203 sys.stderr.write(" %d\n"%int(dtime[:-3]))
188 dl = df.readline() 204
189 if dl == '': 205 dkey, ddate, durl, dtime, messyD = nextDate(df,2)
190 if dfq:
191 if DEBUG:
192 raise ValueError
193 # write out the last of the last index file, if any
194 dkey = ddate = durl = ""
195 else:
196 if DEBUG>1:
197 sys.stderr.write("dl3: %s"%dl.decode('ascii'))
198 DCNT += 1
199 dkey, ddate, durl, dtime = dl.split(b'\t')
200 messyD = ISESSION.search(durl)