Mercurial > hg > cc > cirrus_work
changeset 93:25bd398a8035
improve reordering, still failing on cdx-00004
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 Sep 2023 18:51:21 +0100 |
parents | e56a7aad9ce9 |
children | 009e633eb804 |
files | bin/merge_date.py |
diffstat | 1 files changed, 32 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/merge_date.py	Tue Sep 05 17:33:29 2023 +0100
+++ b/bin/merge_date.py	Wed Sep 06 18:51:21 2023 +0100
@@ -22,7 +22,7 @@
 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                      b'(crawldiagnostics|robotstxt)/')
 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
-                     b'sid|jsessionid|aspsessionid[a-z]*)'
+                     b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                      b'=[^&]*)')
 
 # Above based on this from fixed Java code:
@@ -39,20 +39,21 @@
 
 FN = 0
 XCNT = 0
-dcnt = 0
+DCNT = 0
 
 XF = igzip.IGzipFile(filename=XPATH%0)
 NF = open(NN:=(NPATH%0),'wb')
 
 def nextLine():
   '''Move on to next index file if current has run out'''
-  global FN, NF, NPATH, NN, XF, XPATH, XCNT
+  global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
   while True:
     xl=XF.readline()
     XCNT += 1
     if xl == b'':
       # need to move to next index file
       FN += 1
+      DCNT=0 # this is relative to FN
       XF.close()
       NF.close()
       print(NN, flush=True) # so we can compress it
@@ -64,9 +65,6 @@
       NF = open((NN:=NPATH%FN), 'wb')
       xl = XF.readline()
       XCNT = 1
-    if RorDPAT.search(xl):
-      #print(xl,file=sys.stderr)
-      continue
     return xl
 
 def keys(key):
@@ -77,44 +75,50 @@
     e, b = m.span(2)
     fixed=key[:e]+key[b:]
     if fixed==m[1]:
-      return prefix[:-1], None
+      return True, prefix[:-1], None
     else:
-      return prefix, fixed
+      return True, prefix, fixed
   else:
-    return key, None
+    return False, key, None
 
 dfq = [] # for reordering if needed
 
 with open(sys.argv[1], 'rb') as df:
-  if dfq:
-    dl = dfq.pop(0)
-  else:
-    dl = df.readline()
-    dcnt += 1
+  dl = df.readline()
+  DCNT = 1
   dkey, ddate, dtime = dl.split(b'\t')
 
   while (xl:=nextLine()) is not None:
     xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
-    xkey1, xkey2 = keys(xkey)
-    if xkey2 is not None:
-      while dkey.startswith(xkey1) and dkey!=xkey2:
+    messy, xkey1, xkey2 = keys(xkey)
+    if messy:
+      stale=dfq
+      dfq=[]
+      while (dkey.startswith(xkey1) and
+             (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
         dfq.append(dl)
-        dl = df.readline()
-        dcnt += 1
+        if stale:
+          dl = stale.pop(0)
+        else:
+          dl = df.readline()
+          DCNT += 1
         dkey, ddate, dtime = dl.split(b'\t')
     if (ddate != xdate or
         not dkey.startswith(xkey1) or
         (xkey2 is not None and dkey!=xkey2)):
-      if DEBUG and xkey.decode('ascii')>(dkey.decode('ascii')):
+      if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
         print("Fail: xkey: %s\n"
               " dkey: %s\n"
               " xdate: %s\n"
               " ddate: %s\n"
+              "dfq: %s\n"
               "k1, k2: |%s|%s|\n"
-              "FN: %s dcnt: %s\n"
-              "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, dcnt, xl),
+              "FN: %s XCNT: %s DCNT: %s\n"
+              "xl: %s"%(xkey, dkey, xdate, ddate,
+                        (b'\n '.join(dfq)).decode('ascii'),
+                        xkey1, xkey2, FN, XCNT, DCNT, xl),
               file=sys.stderr)
-        raise ValueError()
+        breakpoint()
       NF.write(xl)
       continue
     NF.write(xkey)
@@ -125,8 +129,11 @@
     NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
     dl = df.readline()
     if dl == '':
+      if dfq:
+        if DEBUG:
+          breakpoint()
       # write out the last of the last index file, if any
-      dkey = ddate = None
+      dkey = ddate = ""
     else:
-      dcnt += 1
+      DCNT += 1
       dkey, ddate, dtime = dl.split(b'\t')
```