# HG changeset patch # User Henry S. Thompson # Date 1693931566 -3600 # Node ID 460f0599e8cd0ba7e655e3c02830e7ed93e858f6 # Parent c1a70532444c3f8f91f99feea0db2e33a58fade9 mostly working, but need to reorder in case of cfid and friends diff -r c1a70532444c -r 460f0599e8cd bin/merge_date.py --- a/bin/merge_date.py Thu Aug 31 14:14:21 2023 +0100 +++ b/bin/merge_date.py Tue Sep 05 17:32:46 2023 +0100 @@ -7,11 +7,30 @@ ksvstream consists of tab-separated key, CC date and Unix timestamp ''' # ' -import sys, io, os, os.path +import sys, io, os, os.path, time, re from isal import igzip +if sys.argv[1] == '-d': + sys.argv.pop(1) + DEBUG = True +else: + DEBUG = False + XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] -NPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[3] +NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] + +RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' +b'(crawldiagnostics|robotstxt)/') +SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' + b'sid|jsessionid|aspsessionid[a-z]*)' + b'=[^&]*)') + +# Above based on this from fixed Java code: +#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), +#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), +#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), +#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), +#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", #print(sys.argv[3],NPATH,file=sys.stderr) @@ -19,35 +38,72 @@ FN = 0 +XCNT = 0 +dcnt = 0 + XF = igzip.IGzipFile(filename=XPATH%0) -NF = open(NPATH%0,'wb') - -XL = b'' +NF = open(NN:=(NPATH%0),'wb') def nextLine(): - global FN, NF, NPATH, XF, XPATH - xl=XF.readline() - if xl == b'': - # need to move to next index file - if NF is None: - FN = 0 - else: + '''Move on to next index file if current has run out''' + global FN, NF, NPATH, NN, XF, XPATH, XCNT + while True: + xl=XF.readline() + XCNT += 1 + if xl == b'': + # need to move to next index file FN += 1 - xn=XPATH%FN - if not os.path.exists(xn): - return - XF = igzip.IGzipFile(filename=xn) - NF = open(NPATH%FN, 'wb') - xl = XF.readline() - return xl + XF.close() + NF.close() + print(NN, flush=True) # so we can compress it + time.sleep(0.1) # so it flushes? + XN=XPATH%FN + if not os.path.exists(XN): + return + XF = igzip.IGzipFile(filename=XN) + NF = open((NN:=NPATH%FN), 'wb') + xl = XF.readline() + XCNT = 1 + if RorDPAT.search(xl): + #print(xl,file=sys.stderr) + continue + return xl + +def keys(key): + '''Deal with failure of 2019-35-vintage Java fixup to detect + parameter-part-initial session ids''' + if m:=SESSION.match(key): + prefix=m[1] + e, b = m.span(2) + fixed=key[:e]+key[b:] + if fixed==m[1]: + return prefix[:-1], None + else: + return prefix, fixed + else: + return key, None with open(sys.argv[1], 'rb') as df: dl = df.readline() - (dkey, ddate, dtime) = dl.split(b'\t') + dcnt += 1 + dkey, ddate, dtime = dl.split(b'\t') while (xl:=nextLine()) is not None: - (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) - if dkey != xkey or ddate != xdate: + xkey, xdate, xprops = xl.split(b' ', maxsplit=2) + xkey1, xkey2 = keys(xkey) + if (ddate != xdate or + not dkey.startswith(xkey1) or + (xkey2 is not None and dkey!=xkey2)): + if DEBUG and xkey.decode('ascii')>(dkey.decode('ascii')): + print("Fail: xkey: %s\n" + " dkey: %s\n" + " xdate: %s\n" + " ddate: %s\n" + "k1, k2: |%s|%s|\n" + "FN: %s\n" + "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, xl), + file=sys.stderr) + raise ValueError() NF.write(xl) continue NF.write(xkey) @@ -61,4 +117,5 @@ # write out the last of the last index file, if any dkey = ddate = None else: - (dkey, ddate, dtime) = dl.split(b'\t') + dcnt += 1 + dkey, ddate, dtime = dl.split(b'\t')