# HG changeset patch # User Henry S. Thompson # Date 1695746577 -3600 # Node ID 0b1e6e134acaf4c4b682ad44d9ce01b46587dbb3 # Parent 5818d79c4ec9287533728b0e48ce06e06854db1a robotstxt and crawldiagnostics get free ride, get rid of DFQ and xq, big simplification and refactor as a result, fix bug in date stream eof handling diff -r 5818d79c4ec9 -r 0b1e6e134aca bin/merge_date.py --- a/bin/merge_date.py Tue Sep 26 14:18:40 2023 +0100 +++ b/bin/merge_date.py Tue Sep 26 17:42:57 2023 +0100 @@ -26,6 +26,7 @@ b'=[^&]*)') ISESSION = re.compile(SESSION.pattern,flags=re.I) URL=re.compile(b'\{"url": "([^"]*)"') +WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/') # Above based on this from broken Java code: # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 @@ -47,11 +48,9 @@ XF = igzip.IGzipFile(filename=XPATH%0) NF = open(NN:=(NPATH%0),'wb') -def nextLine(xq, messyD): +def nextLine(): '''Move on to next index file if current has run out''' global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT - if xq and not messyD: - return xq.pop(0), xq while True: xl=XF.readline() XCNT += 1 @@ -64,142 +63,80 @@ time.sleep(0.1) # so it flushes? XN=XPATH%FN if not os.path.exists(XN): - return (None, None) + return None XF = igzip.IGzipFile(filename=XN) NF = open((NN:=NPATH%FN), 'wb') xl = XF.readline() XCNT = 1 - return xl, xq + if WARC.search(xl): + return xl + else: + NF.write(xl) + if DEBUG: + sys.stderr.write("out_rc\n") -def keys(key): - '''Deal with failure of 2019-35-vintage Java fixup to detect - parameter-part-initial session ids''' - if m:=SESSION.match(key): - prefix=m[1] - e, b = m.span(2) - fixed=key[:e]+key[b:] - if fixed==m[1]: - return True, prefix[:-1], None - else: - return True, prefix, fixed - else: - return False, key, None - -DFQ = [] # for reordering if needed -messyD = False def nextDate(df,dn): - global DEBUG, DFQ, DCNT, ISESSION + global DEBUG, DCNT, XCNT dl = df.readline() if dl == b'': - if DFQ: - if DEBUG: - raise ValueError("EOF but non-empty DFQ: %s"%DFQ) # write out the last of the last index file, if any - return "", "", "", 0, False - if DEBUG>1: + return "", "", "", 0 + if DEBUG: sys.stderr.write("dl%s: %s\n"%(dn,dl)) dkey, ddate, durl, dtime = dl.split(b'\t') - messyD = ISESSION.search(durl) DCNT += 1 - return dkey, ddate, durl, dtime, messyD + return dkey, ddate, durl, dtime with open(sys.argv[1], 'rb') as df: DCNT = 0 - dkey, ddate, durl, dtime, messyD = nextDate(df,1) - xq = [] + dkey, ddate, durl, dtime = nextDate(df,1) - while (nlRes := nextLine(xq, messyD))[0] is not None: - (xl, xq) = nlRes + while (xl := nextLine())[0] is not None: xkey, xdate, xprops = xl.split(b' ', maxsplit=2) m = URL.match(xprops) if m: xurl = m[1] else: raise ValueError("No url in %s"%xprops) - if DEBUG>1: + if DEBUG: sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') for xp in (xkey, xdate, xurl)))) - messyU, xkey1, xkey2 = keys(xkey) - if messyD: - noMatch = (not dkey.startswith(xkey1) or - (xkey2 is not None and dkey!=xkey2)) - if messyU: - # better match - if noMatch: - print("Fail1: md: %s mu: %s\n" - " xkey: %s\n" - " dkey: %s\n" - " xdate: %s\n" - " ddate: %s\n" - " xurl: %s\n" - " durl: %s\n" - "DFQ: %s\n" - "k1, k2: |%s|%s|\n" - "FN: %s XCNT: %s DCNT: %s\n" - "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, - (b'\n '.join(DFQ)).decode('ascii'), - xkey1, xkey2, FN, XCNT, DCNT, xl), - file=sys.stderr) - # fall through to the ordinary (non-messy) match case - else: - # still looking, save if >= date else fall through to write - if DEBUG>1: - print("Diso: match: %s\n" - " xkey: %s\n" - " dkey: %s\n" - " xdate: %s\n" - " ddate: %s\n" - " xurl: %s\n" - " durl: %s\n" - "xl: %s"%(not noMatch, - xkey, dkey, xdate, ddate, xurl, durl, xl), - file=sys.stderr) - if (dkey.startswith(xkey1) and - (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))): - xq.append(xl) - if DEBUG>1: - sys.stderr.write('xpush\n') - continue - # else fall through - if (ddate != xdate or - not dkey.startswith(xkey1) or - (xkey2 is not None and dkey!=xkey2) or - durl!=xurl): - if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): - - print("Fail2: md: %s mu: %s\n" + if dkey==xkey and ddate==xdate and durl==xurl: + # Got it + NF.write(xkey) + NF.write(b' ') + NF.write(xdate) + NF.write(b' ') + NF.write(xprops[:-2]) + NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) + if DEBUG: + sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') + for xp in (xkey, xdate, xurl)))) + sys.stderr.write(" %d\n"%int(dtime[:-3])) + + dkey, ddate, durl, dtime = nextDate(df,2) + continue + else: + if dkey and xkey.decode('ascii')>(dkey.decode('ascii')): + # we've missed something, disaster looms + print("Fail2:" " xkey: %s\n" " dkey: %s\n" " xdate: %s\n" " ddate: %s\n" " xurl: %s\n" " durl: %s\n" - "DFQ: %s\n" - "k1, k2: |%s|%s|\n" "FN: %s XCNT: %s DCNT: %s\n" - "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, + "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, - (b'\n '.join(DFQ)).decode('ascii'), - xkey1, xkey2, FN, XCNT, DCNT, xl), + FN, XCNT, DCNT, xl), file=sys.stderr) # try to force recovery - dkey, ddate, durl, dtime, messyD = nextDate(df,3) - NF.write(xl) - if DEBUG>1: - sys.stderr.write("out_nl\n") - continue - # Got it - NF.write(xkey) - NF.write(b' ') - NF.write(xdate) - NF.write(b' ') - NF.write(xprops[:-2]) - NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) - if DEBUG>1: - sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') - for xp in (xkey, xdate, xurl)))) - sys.stderr.write(" %d\n"%int(dtime[:-3])) - - dkey, ddate, durl, dtime, messyD = nextDate(df,2) + dkey, ddate, durl, dtime = nextDate(df,3) + continue + # else fall through to write + NF.write(xl) + if DEBUG: + sys.stderr.write("out_nl\n")