Mercurial > hg > cc > cirrus_work
diff bin/merge_date.py @ 100:18446a7eeb9e
rework handling of session key problem
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 11 Sep 2023 12:56:47 +0100 |
parents | 009e633eb804 |
children | e2e64c3d763e |
line wrap: on
line diff
--- a/bin/merge_date.py Fri Sep 08 21:40:52 2023 +0100 +++ b/bin/merge_date.py Mon Sep 11 12:56:47 2023 +0100 @@ -10,11 +10,11 @@ import sys, io, os, os.path, time, re from isal import igzip -if sys.argv[1] == '-d': + +DEBUG = 0 +while sys.argv[1] == '-d': sys.argv.pop(1) - DEBUG = True -else: - DEBUG = False + DEBUG += 1 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] @@ -24,6 +24,8 @@ SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' b'=[^&]*)') +ISESSION = re.compile(SESSION.pattern,flags=re.I) +URL=re.compile(b'\{"url": "([^"]*)"') # Above based on this from fixed Java code: #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), @@ -44,16 +46,17 @@ XF = igzip.IGzipFile(filename=XPATH%0) NF = open(NN:=(NPATH%0),'wb') -def nextLine(): +def nextLine(xq, messyD): '''Move on to next index file if current has run out''' global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT + if xq and not messyD: + return xq.pop(0), xq while True: xl=XF.readline() XCNT += 1 if xl == b'': # need to move to next index file FN += 1 - DCNT=0 # this is relative to FN XF.close() NF.close() print(NN, flush=True) # so we can compress it @@ -65,7 +68,7 @@ NF = open((NN:=NPATH%FN), 'wb') xl = XF.readline() XCNT = 1 - return xl + return xl, xq def keys(key): '''Deal with failure of 2019-35-vintage Java fixup to detect @@ -82,58 +85,97 @@ return False, key, None dfq = [] # for reordering if needed +messyD = False with open(sys.argv[1], 'rb') as df: dl = df.readline() DCNT = 1 - dkey, ddate, dtime = dl.split(b'\t') + if DEBUG>1: + sys.stderr.write("dl1: %s"%dl.decode('ascii')) + dkey, ddate, durl, dtime = dl.split(b'\t') + messyD = ISESSION.search(durl) - while (xl:=nextLine()) is not None: + xq = [] + + while (nlRes := nextLine(xq, messyD))[0] is not None: + (xl, xq) = nlRes xkey, xdate, xprops = xl.split(b' ', maxsplit=2) - messy, xkey1, xkey2 = keys(xkey) - if messy: - stale=dfq - dfq=[] - while (dkey.startswith(xkey1) and - (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))): - dfq.append(dl) - if stale: - dl = stale.pop(0) - else: - dl = df.readline() - DCNT += 1 - dkey, ddate, dtime = dl.split(b'\t') + m = URL.match(xprops) + if m: + xurl = m[1] + else: + raise ValueError("No url in %s"%xprops) + if DEBUG>1: + sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') + for xp in (xkey, xdate, xurl)))) + messyU, xkey1, xkey2 = keys(xkey) + if messyD: + if messyU: + # better match + if (ddate != xdate or + not dkey.startswith(xkey1) or + dkey!=xkey1 or + durl!=xurl): + raise ValueError("Fail: xkey: %s\n" + " dkey: %s\n" + " xdate: %s\n" + " ddate: %s\n" + " xurl: %s\n" + " durl: %s\n" + "dfq: %s\n" + "k1, k2: |%s|%s|\n" + "FN: %s XCNT: %s DCNT: %s\n" + "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, + (b'\n '.join(dfq)).decode('ascii'), + xkey1, xkey2, FN, XCNT, DCNT, xl)) + messyD = False + # fall through to the ordinary (non-messy) match case + else: + # still looking, save this one + if DEBUG: + print("Diso: xkey: %s\n" + " dkey: %s\n" + " xdate: %s\n" + " ddate: %s\n" + " xurl: %s\n" + " durl: %s\n" + "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, xl), + file=sys.stderr) + xq.append(xl) + if DEBUG>1: + sys.stderr.write('xpush\n') + continue + else: + # Not messyD + if messyU: + raise ValueError("messyD w/o messyU") if (ddate != xdate or - not dkey.startswith(xkey1) or - (xkey2 is not None and dkey!=xkey2)): - if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): - print("Fail: xkey: %s\n" - " dkey: %s\n" - " xdate: %s\n" - " ddate: %s\n" - "dfq: %s\n" - "k1, k2: |%s|%s|\n" - "FN: %s XCNT: %s DCNT: %s\n" - "xl: %s"%(xkey, dkey, xdate, ddate, - (b'\n '.join(dfq)).decode('ascii'), - xkey1, xkey2, FN, XCNT, DCNT, xl), - file=sys.stderr) - raise ValueError + not dkey.startswith(xkey1) or + (xkey2 is not None and dkey!=xkey2) or + durl!=xurl): NF.write(xl) continue + # Got it NF.write(xkey) NF.write(b' ') NF.write(xdate) NF.write(b' ') NF.write(xprops[:-2]) NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) + if DEBUG>1: + sys.stderr.write("out: %s"%(' '.join(xp.decode('ascii') + for xp in (xkey, xdate, xurl)))) + sys.stderr.write(" %d\n"%int(dtime[:-3])) dl = df.readline() if dl == '': if dfq: if DEBUG: - breakpoint() + raise ValueError # write out the last of the last index file, if any - dkey = ddate = "" + dkey = ddate = durl = "" else: + if DEBUG>1: + sys.stderr.write("dl3: %s"%dl.decode('ascii')) DCNT += 1 - dkey, ddate, dtime = dl.split(b'\t') + dkey, ddate, durl, dtime = dl.split(b'\t') + messyD = ISESSION.search(durl)