Mercurial > hg > cc > cirrus_work
view bin/merge_date.py @ 94:009e633eb804
last version before giving up on approach based only on key and datestamp
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 07 Sep 2023 18:03:55 +0100 |
parents | 25bd398a8035 |
children | 18446a7eeb9e |
line wrap: on
line source
#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
that year's index

Usage: merge_date.py [-d] ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp

Index files are read from cdx-dir/cdx-00NNN.gz (NNN = 000, 001, ...) and
rewritten, uncompressed, to outdir/cdx-00NNN.  The name of each output
file is printed on stdout once it has been closed.

With -d, a mismatch in which the index key sorts after the current
ksvstream key prints diagnostics on stderr and raises ValueError.
''' # '

import sys
import io
import os
import os.path
import time
import re

from isal import igzip

if sys.argv[1] == '-d':
    sys.argv.pop(1)
    DEBUG = True
else:
    DEBUG = False

# Templates for input (gzipped) and output (plain) index file names;
# the remaining %0.3d is filled in with the file number.
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     b'(crawldiagnostics|robotstxt)/')

# Group 1: everything up to and including the first '?'
# Group 2: a session-id parameter immediately following that '?'
# Raw bytes literals so that '\?' is not an invalid string escape.
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')

# Above based on this from fixed Java code:
#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

os.makedirs(sys.argv[3], exist_ok=True)

FN = 0      # number of the index file currently being processed

XCNT = 0    # lines read so far from the current index file
DCNT = 0    # ksvstream lines read; reset whenever the index file changes

XF = igzip.IGzipFile(filename=XPATH%0)
NF = open(NN:=(NPATH%0),'wb')


def nextLine():
    '''Return the next index line, moving on to the next index file
    if the current one has run out

    When the current input is exhausted, both it and the current output
    file are closed, the output file's name is printed on stdout and the
    next numbered pair is opened.  An index file that exists but is empty
    is skipped in the same way.

    Returns None once there is no further index file.
    '''
    global FN, NF, NN, XF, XCNT, DCNT
    while True:
        xl = XF.readline()
        XCNT += 1
        if xl != b'':
            return xl
        # need to move to next index file
        FN += 1
        DCNT = 0 # this is relative to FN
        XF.close()
        NF.close()
        print(NN, flush=True) # so we can compress it
        time.sleep(0.1) # so it flushes?
        XN = XPATH%FN
        if not os.path.exists(XN):
            return None
        XF = igzip.IGzipFile(filename=XN)
        NF = open((NN:=NPATH%FN), 'wb')
        # Loop round to read the first line of the new file, which
        #  will then be counted as line 1
        XCNT = 0


def keys(key):
    '''Deal with failure of 2019-35-vintage Java fixup to detect
    parameter-part-initial session ids

    key is an index key (bytes).  Returns a triple (messy, key1, key2):

      (False, key, None)     no session-id parameter directly after the
                             first '?'
      (True, prefix, fixed)  prefix is key up to and including the '?',
                             fixed is key with the session-id parameter
                             cut out (a following '&' is left in place)
      (True, prefix minus the '?', None)
                             cutting the parameter out leaves nothing
                             but the prefix
    '''
    m = SESSION.match(key)
    if m is None:
        return False, key, None
    prefix = m[1]
    start, end = m.span(2)
    fixed = key[:start]+key[end:]
    if fixed == prefix:
        return True, prefix[:-1], None
    return True, prefix, fixed


def _split_dl(dl):
    '''Split one ksvstream line into (key, date, time).

    The empty line returned by readline() at end of file gives three
    empty bytes objects, which can never match a real index entry.
    '''
    if not dl:
        return b'', b'', b''
    dkey, ddate, dtime = dl.split(b'\t')
    return dkey, ddate, dtime


dfq = [] # for reordering if needed

with open(sys.argv[1], 'rb') as df:
    dl = df.readline()
    DCNT = 1
    dkey, ddate, dtime = _split_dl(dl)
    while (xl:=nextLine()) is not None:
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        messy, xkey1, xkey2 = keys(xkey)
        if messy:
            # Entries sharing this prefix may come in a different order
            #  in the two inputs: set aside the ones which don't match
            #  (in dfq), re-trying those set aside last time round (stale)
            #  before reading any further.
            # NOTE(review): anything still in stale when this loop stops
            #  is dropped, as in the original -- confirm that is intended
            stale = dfq
            dfq = []
            while (dl and
                   dkey.startswith(xkey1) and
                   (ddate != xdate or
                    (xkey2 is not None and dkey != xkey2))):
                dfq.append(dl)
                if stale:
                    dl = stale.pop(0)
                else:
                    dl = df.readline()
                    DCNT += 1
                # dl is empty at end of ksvstream, which ends this loop
                dkey, ddate, dtime = _split_dl(dl)
        if (ddate != xdate or
            not dkey.startswith(xkey1) or
            (xkey2 is not None and dkey != xkey2)):
            # No timestamp for this index line
            if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
                # The index has got ahead of the ksvstream
                print("Fail: xkey: %s\n"
                      " dkey: %s\n"
                      " xdate: %s\n"
                      " ddate: %s\n"
                      "dfq: %s\n"
                      "k1, k2: |%s|%s|\n"
                      "FN: %s XCNT: %s DCNT: %s\n"
                      "xl: %s"%(xkey, dkey, xdate, ddate,
                                (b'\n '.join(dfq)).decode('ascii'),
                                xkey1, xkey2, FN, XCNT, DCNT, xl),
                      file=sys.stderr)
                raise ValueError
            # Copy the line through unchanged
            NF.write(xl)
            continue
        NF.write(xkey)
        NF.write(b' ')
        NF.write(xdate)
        NF.write(b' ')
        # Drop the last two bytes of the properties (presumably the
        #  closing '}' and newline -- TODO confirm) so that the new
        #  property can be added before the object is closed again
        NF.write(xprops[:-2])
        # dtime[:-3] drops the last three bytes of the timestamp field
        #  (presumably a fractional part plus the newline -- TODO confirm)
        NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
        dl = df.readline()
        if dl == b'':
            # df is opened in binary mode, so end of file is b'', not ''
            if dfq:
                if DEBUG:
                    breakpoint()
            # write out the last of the last index file, if any:
            #  an empty key and date never match, so every remaining
            #  index line is copied through unchanged
            dkey = ddate = b''
        else:
            DCNT += 1
            dkey, ddate, dtime = _split_dl(dl)