Mercurial > hg > cc > cirrus_work
comparison bin/merge_date.py @ 113:4a52585a1aac
refactor datestream reading,
fix pattern ordering in SESSION
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 26 Sep 2023 09:03:47 +0100 |
parents | 52c6a9b0fc8c |
children | 0b1e6e134aca |
comparison
equal
deleted
inserted
replaced
112:827eadc72122 | 113:4a52585a1aac |
---|---|
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into | 2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into |
3 that year's index | 3 that year's index |
4 | 4 |
5 Usage: merge_date.py ksvstream cdx-dir outdir | 5 Usage: merge_date.py ksvstream cdx-dir outdir |
6 | 6 |
7 ksvstream consists of tab-separated key, CC date and Unix timestamp | 7 ksvstream consists of tab-separated key, CC date, url and Unix timestamp |
8 ''' # ' | 8 ''' # ' |
9 | 9 |
10 import sys, io, os, os.path, time, re | 10 import sys, io, os, os.path, time, re |
11 from isal import igzip | 11 from isal import igzip |
12 | 12 |
20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] | 20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] |
21 | 21 |
22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' | 22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' |
23 b'(crawldiagnostics|robotstxt)/') | 23 b'(crawldiagnostics|robotstxt)/') |
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' | 24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' |
25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' | 25 b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' |
26 b'=[^&]*)') | 26 b'=[^&]*)') |
27 ISESSION = re.compile(SESSION.pattern,flags=re.I) | 27 ISESSION = re.compile(SESSION.pattern,flags=re.I) |
28 URL=re.compile(b'\{"url": "([^"]*)"') | 28 URL=re.compile(b'\{"url": "([^"]*)"') |
29 | 29 |
30 # Above based on this from broken Java code: | 30 # Above based on this from broken Java code: |
83 else: | 83 else: |
84 return True, prefix, fixed | 84 return True, prefix, fixed |
85 else: | 85 else: |
86 return False, key, None | 86 return False, key, None |
87 | 87 |
88 dfq = [] # for reordering if needed | 88 DFQ = [] # for reordering if needed |
89 messyD = False | 89 messyD = False |
90 | 90 |
91 with open(sys.argv[1], 'rb') as df: | 91 def nextDate(df,dn): |
92 global DEBUG, DFQ, DCNT, ISESSION | |
92 dl = df.readline() | 93 dl = df.readline() |
93 DCNT = 1 | 94 if dl == b'': |
95 if DFQ: | |
96 if DEBUG: | |
97 raise ValueError("EOF but non-empty DFQ: %s"%DFQ) | |
98 # write out the last of the last index file, if any | |
99 return "", "", "", 0, False | |
94 if DEBUG>1: | 100 if DEBUG>1: |
95 sys.stderr.write("dl1: %s"%dl.decode('ascii')) | 101 sys.stderr.write("dl%s: %s\n"%(dn,dl)) |
96 dkey, ddate, durl, dtime = dl.split(b'\t') | 102 dkey, ddate, durl, dtime = dl.split(b'\t') |
97 messyD = ISESSION.search(durl) | 103 messyD = ISESSION.search(durl) |
104 DCNT += 1 | |
105 return dkey, ddate, durl, dtime, messyD | |
106 | |
107 with open(sys.argv[1], 'rb') as df: | |
108 DCNT = 0 | |
109 dkey, ddate, durl, dtime, messyD = nextDate(df,1) | |
98 | 110 |
99 xq = [] | 111 xq = [] |
100 | 112 |
101 while (nlRes := nextLine(xq, messyD))[0] is not None: | 113 while (nlRes := nextLine(xq, messyD))[0] is not None: |
102 (xl, xq) = nlRes | 114 (xl, xq) = nlRes |
114 noMatch = (not dkey.startswith(xkey1) or | 126 noMatch = (not dkey.startswith(xkey1) or |
115 (xkey2 is not None and dkey!=xkey2)) | 127 (xkey2 is not None and dkey!=xkey2)) |
116 if messyU: | 128 if messyU: |
117 # better match | 129 # better match |
118 if noMatch: | 130 if noMatch: |
119 raise ValueError("Fail1: md: %s mu: %s\n" | 131 print("Fail1: md: %s mu: %s\n" |
120 " xkey: %s\n" | 132 " xkey: %s\n" |
121 " dkey: %s\n" | 133 " dkey: %s\n" |
122 " xdate: %s\n" | 134 " xdate: %s\n" |
123 " ddate: %s\n" | 135 " ddate: %s\n" |
124 " xurl: %s\n" | 136 " xurl: %s\n" |
125 " durl: %s\n" | 137 " durl: %s\n" |
126 "dfq: %s\n" | 138 "DFQ: %s\n" |
127 "k1, k2: |%s|%s|\n" | 139 "k1, k2: |%s|%s|\n" |
128 "FN: %s XCNT: %s DCNT: %s\n" | 140 "FN: %s XCNT: %s DCNT: %s\n" |
129 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, | 141 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, |
130 (b'\n '.join(dfq)).decode('ascii'), | 142 (b'\n '.join(DFQ)).decode('ascii'), |
131 xkey1, xkey2, FN, XCNT, DCNT, xl)) | 143 xkey1, xkey2, FN, XCNT, DCNT, xl), |
144 file=sys.stderr) | |
132 # fall through to the ordinary (non-messy) match case | 145 # fall through to the ordinary (non-messy) match case |
133 else: | 146 else: |
134 # still looking, save if >= date else fall through to write | 147 # still looking, save if >= date else fall through to write |
135 if DEBUG>1: | 148 if DEBUG>1: |
136 print("Diso: match: %s\n" | 149 print("Diso: match: %s\n" |
154 not dkey.startswith(xkey1) or | 167 not dkey.startswith(xkey1) or |
155 (xkey2 is not None and dkey!=xkey2) or | 168 (xkey2 is not None and dkey!=xkey2) or |
156 durl!=xurl): | 169 durl!=xurl): |
157 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): | 170 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): |
158 | 171 |
159 raise ValueError("Fail2: md: %s mu: %s\n" | 172 print("Fail2: md: %s mu: %s\n" |
160 " xkey: %s\n" | 173 " xkey: %s\n" |
161 " dkey: %s\n" | 174 " dkey: %s\n" |
162 " xdate: %s\n" | 175 " xdate: %s\n" |
163 " ddate: %s\n" | 176 " ddate: %s\n" |
164 " xurl: %s\n" | 177 " xurl: %s\n" |
165 " durl: %s\n" | 178 " durl: %s\n" |
166 "dfq: %s\n" | 179 "DFQ: %s\n" |
167 "k1, k2: |%s|%s|\n" | 180 "k1, k2: |%s|%s|\n" |
168 "FN: %s XCNT: %s DCNT: %s\n" | 181 "FN: %s XCNT: %s DCNT: %s\n" |
169 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, | 182 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, |
170 xurl, durl, | 183 xurl, durl, |
171 (b'\n '.join(dfq)).decode('ascii'), | 184 (b'\n '.join(DFQ)).decode('ascii'), |
172 xkey1, xkey2, FN, XCNT, DCNT, xl)) | 185 xkey1, xkey2, FN, XCNT, DCNT, xl), |
186 file=sys.stderr) | |
187 # try to force recovery | |
188 dkey, ddate, durl, dtime, messyD = nextDate(df,3) | |
173 NF.write(xl) | 189 NF.write(xl) |
174 if DEBUG>1: | 190 if DEBUG>1: |
175 sys.stderr.write("out_nl\n") | 191 sys.stderr.write("out_nl\n") |
176 continue | 192 continue |
177 # Got it | 193 # Got it |
183 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) | 199 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) |
184 if DEBUG>1: | 200 if DEBUG>1: |
185 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') | 201 sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') |
186 for xp in (xkey, xdate, xurl)))) | 202 for xp in (xkey, xdate, xurl)))) |
187 sys.stderr.write(" %d\n"%int(dtime[:-3])) | 203 sys.stderr.write(" %d\n"%int(dtime[:-3])) |
188 dl = df.readline() | 204 |
189 if dl == '': | 205 dkey, ddate, durl, dtime, messyD = nextDate(df,2) |
190 if dfq: | |
191 if DEBUG: | |
192 raise ValueError | |
193 # write out the last of the last index file, if any | |
194 dkey = ddate = durl = "" | |
195 else: | |
196 if DEBUG>1: | |
197 sys.stderr.write("dl3: %s"%dl.decode('ascii')) | |
198 DCNT += 1 | |
199 dkey, ddate, durl, dtime = dl.split(b'\t') | |
200 messyD = ISESSION.search(durl) |