# HG changeset patch # User Henry S. Thompson # Date 1695715427 -3600 # Node ID 4a52585a1aac356eb1d82c289d6ec2ee5ca2a0c8 # Parent 827eadc721227af6f60d92183e67344258a8be93 refactor datestream reading, fix pattern ordering in SESSION diff -r 827eadc72122 -r 4a52585a1aac bin/merge_date.py --- a/bin/merge_date.py Mon Sep 25 23:53:13 2023 +0100 +++ b/bin/merge_date.py Tue Sep 26 09:03:47 2023 +0100 @@ -4,7 +4,7 @@ Usage: merge_date.py ksvstream cdx-dir outdir -ksvstream consists of tab-separated key, CC date and Unix timestamp +ksvstream consists of tab-separated key, CC date, url and Unix timestamp ''' # ' import sys, io, os, os.path, time, re @@ -22,7 +22,7 @@ RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' b'(crawldiagnostics|robotstxt)/') SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' - b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' + b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' b'=[^&]*)') ISESSION = re.compile(SESSION.pattern,flags=re.I) URL=re.compile(b'\{"url": "([^"]*)"') @@ -85,16 +85,28 @@ else: return False, key, None -dfq = [] # for reordering if needed +DFQ = [] # for reordering if needed messyD = False +def nextDate(df,dn): + global DEBUG, DFQ, DCNT, ISESSION + dl = df.readline() + if dl == b'': + if DFQ: + if DEBUG: + raise ValueError("EOF but non-empty DFQ: %s"%DFQ) + # write out the last of the last index file, if any + return "", "", "", 0, False + if DEBUG>1: + sys.stderr.write("dl%s: %s\n"%(dn,dl)) + dkey, ddate, durl, dtime = dl.split(b'\t') + messyD = ISESSION.search(durl) + DCNT += 1 + return dkey, ddate, durl, dtime, messyD + with open(sys.argv[1], 'rb') as df: - dl = df.readline() - DCNT = 1 - if DEBUG>1: - sys.stderr.write("dl1: %s"%dl.decode('ascii')) - dkey, ddate, durl, dtime = dl.split(b'\t') - messyD = ISESSION.search(durl) + DCNT = 0 + dkey, ddate, durl, dtime, messyD = nextDate(df,1) xq = [] @@ -116,19 +128,20 @@ if messyU: # better match if noMatch: - raise ValueError("Fail1: md: %s mu: %s\n" + print("Fail1: md: %s mu: %s\n" " xkey: %s\n" " dkey: %s\n" " xdate: %s\n" " ddate: %s\n" " xurl: %s\n" " durl: %s\n" - "dfq: %s\n" + "DFQ: %s\n" "k1, k2: |%s|%s|\n" "FN: %s XCNT: %s DCNT: %s\n" "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, - (b'\n '.join(dfq)).decode('ascii'), - xkey1, xkey2, FN, XCNT, DCNT, xl)) + (b'\n '.join(DFQ)).decode('ascii'), + xkey1, xkey2, FN, XCNT, DCNT, xl), + file=sys.stderr) # fall through to the ordinary (non-messy) match case else: # still looking, save if >= date else fall through to write @@ -156,20 +169,23 @@ durl!=xurl): if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): - raise ValueError("Fail2: md: %s mu: %s\n" + print("Fail2: md: %s mu: %s\n" " xkey: %s\n" " dkey: %s\n" " xdate: %s\n" " ddate: %s\n" " xurl: %s\n" " durl: %s\n" - "dfq: %s\n" + "DFQ: %s\n" "k1, k2: |%s|%s|\n" "FN: %s XCNT: %s DCNT: %s\n" "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, - (b'\n '.join(dfq)).decode('ascii'), - xkey1, xkey2, FN, XCNT, DCNT, xl)) + (b'\n '.join(DFQ)).decode('ascii'), + xkey1, xkey2, FN, XCNT, DCNT, xl), + file=sys.stderr) + # try to force recovery + dkey, ddate, durl, dtime, messyD = nextDate(df,3) NF.write(xl) if DEBUG>1: sys.stderr.write("out_nl\n") @@ -185,16 +201,5 @@ sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') for xp in (xkey, xdate, xurl)))) sys.stderr.write(" %d\n"%int(dtime[:-3])) - dl = df.readline() - if dl == '': - if dfq: - if DEBUG: - raise ValueError - # write out the last of the last index file, if any - dkey = ddate = durl = "" - else: - if DEBUG>1: - sys.stderr.write("dl3: %s"%dl.decode('ascii')) - DCNT += 1 - dkey, ddate, durl, dtime = dl.split(b'\t') - messyD = ISESSION.search(durl) + + dkey, ddate, durl, dtime, messyD = nextDate(df,2)