bin/merge_date.py @ 115:0b1e6e134aca
robotstxt and crawldiagnostics get free ride,
get rid of DFQ and xq,
big simplification and refactor as a result,
fix bug in date stream eof handling
| author   | Henry S. Thompson <ht@inf.ed.ac.uk> |
|----------|-------------------------------------|
| date     | Tue, 26 Sep 2023 17:42:57 +0100     |
| parents  | 4a52585a1aac                        |
| children | f52783faf3ee                        |
#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date, url and Unix timestamp
'''
# '
import sys, io, os, os.path, time, re
from isal import igzip

DEBUG = 0
while sys.argv[1] == '-d':
    sys.argv.pop(1)
    DEBUG += 1

XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     b'(crawldiagnostics|robotstxt)/')
SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
                     b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
                     b'=[^&]*)')
ISESSION = re.compile(SESSION.pattern,flags=re.I)
URL=re.compile(b'\{"url": "([^"]*)"')
WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
# Above based on this from broken Java code:
#  https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)
os.makedirs(sys.argv[3], exist_ok=True)

FN = 0
XCNT = 0
DCNT = 0
XF = igzip.IGzipFile(filename=XPATH%0)
NF = open(NN:=(NPATH%0),'wb')

def nextLine():
    '''Move on to next index file if current has run out'''
    global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
    while True:
        xl=XF.readline()
        XCNT += 1
        if xl == b'':
            # need to move to next index file
            FN += 1
            XF.close()
            NF.close()
            print(NN, flush=True) # so we can compress it
            time.sleep(0.1) # so it flushes?
            XN=XPATH%FN
            if not os.path.exists(XN):
                return None  # no more index files
            XF = igzip.IGzipFile(filename=XN)
            NF = open((NN:=NPATH%FN), 'wb')
            xl = XF.readline()
            XCNT = 1
        if WARC.search(xl):
            return xl
        else:
            # robotstxt and crawldiagnostics records get a free ride:
            # copy them through unchanged
            NF.write(xl)
            if DEBUG:
                sys.stderr.write("out_rc\n")

def nextDate(df,dn):
    global DEBUG, DCNT, XCNT
    dl = df.readline()
    if dl == b'':
        # write out the last of the last index file, if any
        return "", "", "", 0
    if DEBUG:
        sys.stderr.write("dl%s: %s\n"%(dn,dl))
    dkey, ddate, durl, dtime = dl.split(b'\t')
    DCNT += 1
    return dkey, ddate, durl, dtime

# Merge join: the date stream and the index stream are both assumed to be
# sorted by key, so they can be advanced in step
with open(sys.argv[1], 'rb') as df:
    DCNT = 0
    dkey, ddate, durl, dtime = nextDate(df,1)
    # nextLine() returns None once the last index file is exhausted
    while (xl := nextLine()) is not None:
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        m = URL.match(xprops)
        if m:
            xurl = m[1]
        else:
            raise ValueError("No url in %s"%xprops)
        if DEBUG:
            sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                                  for xp in (xkey, xdate, xurl))))
        if dkey==xkey and ddate==xdate and durl==xurl:
            # Got it: splice the Unix timestamp into the JSON properties
            NF.write(xkey)
            NF.write(b' ')
            NF.write(xdate)
            NF.write(b' ')
            NF.write(xprops[:-2])
            NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
            if DEBUG:
                sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
                                                       for xp in (xkey, xdate, xurl))))
                sys.stderr.write(" %d\n"%int(dtime[:-3]))
            dkey, ddate, durl, dtime = nextDate(df,2)
            continue
        else:
            if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
                # we've missed something, disaster looms
                print("Fail2:"
                      " xkey: %s\n"
                      " dkey: %s\n"
                      " xdate: %s\n"
                      " ddate: %s\n"
                      " xurl: %s\n"
                      " durl: %s\n"
                      "FN: %s XCNT: %s DCNT: %s\n"
                      "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
                                FN, XCNT, DCNT, xl),
                      file=sys.stderr)
                # try to force recovery
                dkey, ddate, durl, dtime = nextDate(df,3)
                continue
            # else fall through to write the line unchanged
            NF.write(xl)
            if DEBUG:
                sys.stderr.write("out_nl\n")
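For reference, here is a minimal sketch of the two record formats the script merges and of the transformation applied when a ksvstream entry matches an index line. It is not part of merge_date.py: the key, CC date, URL, timestamp and filename values are invented, and the assumption that the ksvstream timestamp ends in `.0` (so that `dtime[:-3]` leaves whole seconds) is inferred from the code rather than confirmed by it.

```python
# Illustration only -- not part of merge_date.py.  Sample values are invented;
# the ksvstream timestamp format ("<seconds>.0") is an assumption consistent
# with the script's use of dtime[:-3].

# One ksvstream record: tab-separated key, CC date, url, Unix timestamp
ksv_line = (b'com,example)/page\t20230926174257\t'
            b'https://example.com/page\t1695741777.0\n')

# One CDX index line for the same capture: key, CC date, then JSON properties
cdx_line = (b'com,example)/page 20230926174257 '
            b'{"url": "https://example.com/page", "mime": "text/html", '
            b'"filename": "crawl-data/CC-MAIN-2023-40/segments/1695000000000.0/'
            b'warc/CC-MAIN-000.warc.gz"}\n')

def merge_one(cdx_line, ksv_line):
    '''Mimic the matched-record branch of the main loop: splice a
    "lastmod" field (whole seconds) into the CDX JSON properties.'''
    dkey, ddate, durl, dtime = ksv_line.split(b'\t')
    xkey, xdate, xprops = cdx_line.split(b' ', maxsplit=2)
    # The real script also checks durl against the "url" property via its URL regex
    assert (dkey, ddate) == (xkey, xdate)
    return b'%s %s %s, "lastmod": "%d"}\n' % (
        xkey, xdate, xprops[:-2], int(dtime[:-3]))

print(merge_one(cdx_line, ksv_line).decode('ascii'), end='')
```

Invocation follows the docstring: `merge_date.py ksvstream cdx-dir outdir`, optionally preceded by one or more `-d` flags to raise the debug level; non-warc entries (robotstxt, crawldiagnostics) and index lines with no matching date record are copied through unchanged.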