# HG changeset patch # User Henry S. Thompson # Date 1695888078 -3600 # Node ID f8d5e5355c4cb72ddad7d8c1ea7026e05d893200 # Parent 1d1bd22124c0143b3ea6d6e3333b80d5ea24e250 creating lmh package diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh/lmh.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/lmh/lmh.py Thu Sep 28 09:01:18 2023 +0100 @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +'''Extract identifying info + LastModified header value for all entries + that have one + + Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno''' + +import re,warc,sys,glob,codecs + +TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) +DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) +LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) + +DTAB=bytearray(range(256)) +DDEL=b'TZ-:' + +OUT=open(sys.stdout.fileno(),'wb') + +def showmeLMH(wtype,buf,part): + global URI, DATE, SEGMENT, FILETYPE, FILENO + if part==1: + if (m:=TUPAT.search(buf)): + URI=m[1] + else: + raise ValueError(b"No target URI in %s ??"%buf) + if (md:=DPAT.search(buf)): + DATE=md[1] + else: + raise ValueError(b"No date in %s ??"%buf) + else: + mm=LMPAT.search(buf) + OUT.write(URI) + if mm: + OUT.write(b'\t') + OUT.write(DATE.translate(DTAB,DDEL)) + OUT.write(b'\t') + OUT.write(SEGMENT) + OUT.write(b'\t') + OUT.write(FILETYPE) + OUT.write(b'\t') + OUT.write(FILENO) + OUT.write(b'\t') + OUT.write(mm[1]) + OUT.write(b'\n') + +(CCdate, segment, filetype, fileno) = sys.argv[1:] +fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( + CCdate, segment, filetype, fileno) + +SEGMENT=codecs.encode(segment,'ascii') +FILETYPE=codecs.encode(filetype,'ascii') +FILENO=codecs.encode(fileno,'ascii') + +warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3) + diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh/merge_date.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/lmh/merge_date.py Thu Sep 28 09:01:18 2023 +0100 @@ -0,0 +1,144 @@ +#!/usr/bin/python3 +'''Add timestamps from Last-Modified-dated (ks.tsv) files into + that year's index + +Usage: merge_date.py ksvstream cdx-dir outdir + +ksvstream consists of tab-separated key, CC date, url and Unix timestamp +''' # ' + +import sys, io, os, os.path, time, re +from isal import igzip + + +DEBUG = 0 +while sys.argv[1] == '-d': + sys.argv.pop(1) + DEBUG += 1 + +XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] +NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] + +RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' +b'(crawldiagnostics|robotstxt)/') +SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' + b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' + b'=[^&]*)') +ISESSION = re.compile(SESSION.pattern,flags=re.I) +URL=re.compile(b'\{"url": "([^"]*)"') +WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/') + +# Above based on this from broken Java code: +# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 +#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), +#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), +#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), +#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), +#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", + +#print(sys.argv[3],NPATH,file=sys.stderr) + +os.makedirs(sys.argv[3], exist_ok=True) + +FN = 0 + +XCNT = WCNT = 0 +DCNT = 0 + +XF = igzip.IGzipFile(filename=XPATH%0) +NF = open(NN:=(NPATH%0),'wb') + +def nextLine(): + '''Move on to next index file if current has run out''' + global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT + while True: + xl=XF.readline() + XCNT += 1 + if xl == b'': + # need to move to next index file + FN += 1 + XF.close() + NF.close() + print(NN, flush=True) # so we can compress it + print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True) + time.sleep(0.1) # so they flush? + XN=XPATH%FN + if not os.path.exists(XN): + return None + XF = igzip.IGzipFile(filename=XN) + NF = open((NN:=NPATH%FN), 'wb') + xl = XF.readline() + WCNT = XCNT = 1 + if WARC.search(xl): + WCNT += 1 + return xl + else: + NF.write(xl) + if DEBUG: + sys.stderr.write("out_rc\n") + + +def nextDate(df,dn): + global DEBUG, DCNT, XCNT + dl = df.readline() + if dl == b'': + # write out the last of the last index file, if any + return "", "", "", 0 + if DEBUG: + sys.stderr.write("dl%s: %s\n"%(dn,dl)) + dkey, ddate, durl, dtime = dl.split(b'\t') + DCNT += 1 + return dkey, ddate, durl, dtime + +with open(sys.argv[1], 'rb') as df: + DCNT = 0 + + dkey, ddate, durl, dtime = nextDate(df,1) + + while (xl := nextLine()) is not None: + xkey, xdate, xprops = xl.split(b' ', maxsplit=2) + m = URL.match(xprops) + if m: + xurl = m[1] + else: + raise ValueError("No url in %s"%xprops) + if DEBUG: + sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') + for xp in (xkey, xdate, xurl)))) + if dkey==xkey and ddate==xdate and durl==xurl: + # Got it + NF.write(xkey) + NF.write(b' ') + NF.write(xdate) + NF.write(b' ') + NF.write(xprops[:-2]) + NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) + if DEBUG: + sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') + for xp in (xkey, xdate, xurl)))) + sys.stderr.write(" %d\n"%int(dtime[:-3])) + + dkey, ddate, durl, dtime = nextDate(df,2) + continue + else: + if dkey and xkey.decode('ascii')>(dkey.decode('ascii')): + # we've missed something, disaster looms + print("Fail2:" + " xkey: %s\n" + " dkey: %s\n" + " xdate: %s\n" + " ddate: %s\n" + " xurl: %s\n" + " durl: %s\n" + "FN: %s XCNT: %s DCNT: %s\n" + "xl: %s"%(xkey, dkey, xdate, ddate, + xurl, durl, + FN, XCNT, DCNT, xl), + file=sys.stderr) + # try to force recovery + dkey, ddate, durl, dtime = nextDate(df,3) + continue + # else fall through to write + NF.write(xl) + if DEBUG: + sys.stderr.write("out_nl\n") diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh/sort_date.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/python/cc/lmh/sort_date.py Thu Sep 28 09:01:18 2023 +0100 @@ -0,0 +1,142 @@ +#!/usr/bin/python3 +'''Process output of lmh_warc [new 3-column version] + Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") +''' + +# Assumes you have used grep -v $'\t' on input for speed +# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' +# to fix a common 'bad' timestamp (~ .2% of inputs) + +import email.utils +import sys +from urllib.parse import urlsplit, quote, unquote +import surt + +import re, codecs +from itertools import chain + +WPAT = re.compile('(,www\\d*)+\\)') + +# Thanks to https://stackoverflow.com/a/8776871 +import locale +from functools import cmp_to_key + +def percent_encode(ude): + #print(ude.object,ude.object[ude.start:ude.end]) + return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('percent',percent_encode) + +def _u_esc(c): + if c<65536: + return '\\u%04X'%c + else: + return '\\U%08X'%c + +def java_unicode_encode(ude): + '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn''' + return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('java_unicode',java_unicode_encode) + +# From RFC-3986: +# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" +# / "*" / "+" / "," / ";" / "=" +# But # _is_ escaped in Java surt results +# and additionally " \ : < = > ? \ ^ _ ` { | } are not + +# Note also that although quote already does _not_ quote - . / _ ~ +# they are included below as that's what we find in surt.surt 0.3.1 + +# Also, Java surt strips _all_ leading 'www\d*.', +# where python3 surt only strips the first one. + +# And Java strips so-called option session-ids, but python doesn't + +import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer +import surt.URLRegexTransformer + +ident = ''.join(chr(i) for i in range(256)).encode('latin-1') + +IDMAP=bytes.maketrans(ident,ident) + +# For removal of non-printing characters: +# Note, this is only a guess, only example so are is DEL +NONPRINT= ''.join(chr(i) for i in chain(range(9), + range(14,32), + [127] # DEL + )).encode('latin-1') + +def notDefaultCanon(hu,**options): + if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): + # Try to fix the incompatibility between Java and + # Python surt handling of 'octal' numbers in numeric IPv4 addresses + # and it should! See "After this line: + # + # 15,225,107,143)" in .../azure/notes.txt + try: + bytestrs = hu.host.split(b'.') + hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) + except ValueError: + pass + if hu.query: + hu.query = hu.query.translate(IDMAP,delete=NONPRINT) + return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) + +# Hack this to reproduce the Java bug +surt.URLRegexTransformer._RES_QUERY_SESSIONID = [ + re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), + re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), + ] + +# Above based on this from broken Java code: +# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 +#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), +#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), +#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), +#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), +#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", + +def cdx_key(uristring): + _surt = quote(unquote(surt.surt(unquote(uristring), + canonicalizer=notDefaultCanon), + errors='percent'), + safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # ' + ).lower() + # Wrt \x7f (DEL), see discussion in notes wrt + # "biz,televida)" case + # It remains to be seen whether other non-printing bytes + # will need to be treated as 'safe' + return WPAT.sub(')',_surt) + +def keyed(l): + uri, cc_stamp, dateTime = l.split('\t',2) + #print('ul',uri,file=sys.stderr) + try: + try: + epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() + except OverflowError: + epoch = 32535215999.0 + return ((cdx_key(uri), cc_stamp, uri), epoch) + except (TypeError,IndexError,ValueError) as e: + print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) + return + +fstr = sys.argv[1] + +with open(fstr,"r") as ff: + # crucial that the following is done _after_ the file is opened + # with the default (utf-8) locale! + locale.setlocale(locale.LC_ALL, "C") + ctk=cmp_to_key(locale.strcoll) + for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None), + key=lambda x:x[0]): + print(key[0],key[1], + key[2].encode('ascii',errors='java_unicode').decode('ascii'), + ts,sep='\t') diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh_warc.py --- a/lib/python/cc/lmh_warc.py Thu Sep 28 08:46:01 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -'''Extract identifying info + LastModified header value for all entries - that have one - - Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno''' - -import re,warc,sys,glob,codecs - -TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) -DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) -LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) - -DTAB=bytearray(range(256)) -DDEL=b'TZ-:' - -OUT=open(sys.stdout.fileno(),'wb') - -def showmeLMH(wtype,buf,part): - global URI, DATE, SEGMENT, FILETYPE, FILENO - if part==1: - if (m:=TUPAT.search(buf)): - URI=m[1] - else: - raise ValueError(b"No target URI in %s ??"%buf) - if (md:=DPAT.search(buf)): - DATE=md[1] - else: - raise ValueError(b"No date in %s ??"%buf) - else: - mm=LMPAT.search(buf) - OUT.write(URI) - if mm: - OUT.write(b'\t') - OUT.write(DATE.translate(DTAB,DDEL)) - OUT.write(b'\t') - OUT.write(SEGMENT) - OUT.write(b'\t') - OUT.write(FILETYPE) - OUT.write(b'\t') - OUT.write(FILENO) - OUT.write(b'\t') - OUT.write(mm[1]) - OUT.write(b'\n') - -(CCdate, segment, filetype, fileno) = sys.argv[1:] -fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( - CCdate, segment, filetype, fileno) - -SEGMENT=codecs.encode(segment,'ascii') -FILETYPE=codecs.encode(filetype,'ascii') -FILENO=codecs.encode(fileno,'ascii') - -warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3) - diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/merge_date.py --- a/lib/python/cc/merge_date.py Thu Sep 28 08:46:01 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,144 +0,0 @@ -#!/usr/bin/python3 -'''Add timestamps from Last-Modified-dated (ks.tsv) files into - that year's index - -Usage: merge_date.py ksvstream cdx-dir outdir - -ksvstream consists of tab-separated key, CC date, url and Unix timestamp -''' # ' - -import sys, io, os, os.path, time, re -from isal import igzip - - -DEBUG = 0 -while sys.argv[1] == '-d': - sys.argv.pop(1) - DEBUG += 1 - -XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] -NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] - -RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' -b'(crawldiagnostics|robotstxt)/') -SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' - b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)' - b'=[^&]*)') -ISESSION = re.compile(SESSION.pattern,flags=re.I) -URL=re.compile(b'\{"url": "([^"]*)"') -WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/') - -# Above based on this from broken Java code: -# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 -#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), -#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), -#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), -#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), -#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", - -#print(sys.argv[3],NPATH,file=sys.stderr) - -os.makedirs(sys.argv[3], exist_ok=True) - -FN = 0 - -XCNT = WCNT = 0 -DCNT = 0 - -XF = igzip.IGzipFile(filename=XPATH%0) -NF = open(NN:=(NPATH%0),'wb') - -def nextLine(): - '''Move on to next index file if current has run out''' - global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT - while True: - xl=XF.readline() - XCNT += 1 - if xl == b'': - # need to move to next index file - FN += 1 - XF.close() - NF.close() - print(NN, flush=True) # so we can compress it - print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True) - time.sleep(0.1) # so they flush? - XN=XPATH%FN - if not os.path.exists(XN): - return None - XF = igzip.IGzipFile(filename=XN) - NF = open((NN:=NPATH%FN), 'wb') - xl = XF.readline() - WCNT = XCNT = 1 - if WARC.search(xl): - WCNT += 1 - return xl - else: - NF.write(xl) - if DEBUG: - sys.stderr.write("out_rc\n") - - -def nextDate(df,dn): - global DEBUG, DCNT, XCNT - dl = df.readline() - if dl == b'': - # write out the last of the last index file, if any - return "", "", "", 0 - if DEBUG: - sys.stderr.write("dl%s: %s\n"%(dn,dl)) - dkey, ddate, durl, dtime = dl.split(b'\t') - DCNT += 1 - return dkey, ddate, durl, dtime - -with open(sys.argv[1], 'rb') as df: - DCNT = 0 - - dkey, ddate, durl, dtime = nextDate(df,1) - - while (xl := nextLine()) is not None: - xkey, xdate, xprops = xl.split(b' ', maxsplit=2) - m = URL.match(xprops) - if m: - xurl = m[1] - else: - raise ValueError("No url in %s"%xprops) - if DEBUG: - sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') - for xp in (xkey, xdate, xurl)))) - if dkey==xkey and ddate==xdate and durl==xurl: - # Got it - NF.write(xkey) - NF.write(b' ') - NF.write(xdate) - NF.write(b' ') - NF.write(xprops[:-2]) - NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) - if DEBUG: - sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii') - for xp in (xkey, xdate, xurl)))) - sys.stderr.write(" %d\n"%int(dtime[:-3])) - - dkey, ddate, durl, dtime = nextDate(df,2) - continue - else: - if dkey and xkey.decode('ascii')>(dkey.decode('ascii')): - # we've missed something, disaster looms - print("Fail2:" - " xkey: %s\n" - " dkey: %s\n" - " xdate: %s\n" - " ddate: %s\n" - " xurl: %s\n" - " durl: %s\n" - "FN: %s XCNT: %s DCNT: %s\n" - "xl: %s"%(xkey, dkey, xdate, ddate, - xurl, durl, - FN, XCNT, DCNT, xl), - file=sys.stderr) - # try to force recovery - dkey, ddate, durl, dtime = nextDate(df,3) - continue - # else fall through to write - NF.write(xl) - if DEBUG: - sys.stderr.write("out_nl\n") diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/sort_date.py --- a/lib/python/cc/sort_date.py Thu Sep 28 08:46:01 2023 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,142 +0,0 @@ -#!/usr/bin/python3 -'''Process output of lmh_warc [new 3-column version] - Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") -''' - -# Assumes you have used grep -v $'\t' on input for speed -# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' -# to fix a common 'bad' timestamp (~ .2% of inputs) - -import email.utils -import sys -from urllib.parse import urlsplit, quote, unquote -import surt - -import re, codecs -from itertools import chain - -WPAT = re.compile('(,www\\d*)+\\)') - -# Thanks to https://stackoverflow.com/a/8776871 -import locale -from functools import cmp_to_key - -def percent_encode(ude): - #print(ude.object,ude.object[ude.start:ude.end]) - return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), - ude.end) - -codecs.register_error('percent',percent_encode) - -def _u_esc(c): - if c<65536: - return '\\u%04X'%c - else: - return '\\U%08X'%c - -def java_unicode_encode(ude): - '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn''' - return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]), - ude.end) - -codecs.register_error('java_unicode',java_unicode_encode) - -# From RFC-3986: -# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" -# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" -# / "*" / "+" / "," / ";" / "=" -# But # _is_ escaped in Java surt results -# and additionally " \ : < = > ? \ ^ _ ` { | } are not - -# Note also that although quote already does _not_ quote - . / _ ~ -# they are included below as that's what we find in surt.surt 0.3.1 - -# Also, Java surt strips _all_ leading 'www\d*.', -# where python3 surt only strips the first one. - -# And Java strips so-called option session-ids, but python doesn't - -import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer -import surt.URLRegexTransformer - -ident = ''.join(chr(i) for i in range(256)).encode('latin-1') - -IDMAP=bytes.maketrans(ident,ident) - -# For removal of non-printing characters: -# Note, this is only a guess, only example so are is DEL -NONPRINT= ''.join(chr(i) for i in chain(range(9), - range(14,32), - [127] # DEL - )).encode('latin-1') - -def notDefaultCanon(hu,**options): - if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): - # Try to fix the incompatibility between Java and - # Python surt handling of 'octal' numbers in numeric IPv4 addresses - # and it should! See "After this line: - # - # 15,225,107,143)" in .../azure/notes.txt - try: - bytestrs = hu.host.split(b'.') - hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) - except ValueError: - pass - if hu.query: - hu.query = hu.query.translate(IDMAP,delete=NONPRINT) - return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) - -# Hack this to reproduce the Java bug -surt.URLRegexTransformer._RES_QUERY_SESSIONID = [ - re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), - re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), - ] - -# Above based on this from broken Java code: -# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 -#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), -#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), -#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), -#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), -#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", - -def cdx_key(uristring): - _surt = quote(unquote(surt.surt(unquote(uristring), - canonicalizer=notDefaultCanon), - errors='percent'), - safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # ' - ).lower() - # Wrt \x7f (DEL), see discussion in notes wrt - # "biz,televida)" case - # It remains to be seen whether other non-printing bytes - # will need to be treated as 'safe' - return WPAT.sub(')',_surt) - -def keyed(l): - uri, cc_stamp, dateTime = l.split('\t',2) - #print('ul',uri,file=sys.stderr) - try: - try: - epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() - except OverflowError: - epoch = 32535215999.0 - return ((cdx_key(uri), cc_stamp, uri), epoch) - except (TypeError,IndexError,ValueError) as e: - print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) - return - -fstr = sys.argv[1] - -with open(fstr,"r") as ff: - # crucial that the following is done _after_ the file is opened - # with the default (utf-8) locale! - locale.setlocale(locale.LC_ALL, "C") - ctk=cmp_to_key(locale.strcoll) - for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None), - key=lambda x:x[0]): - print(key[0],key[1], - key[2].encode('ascii',errors='java_unicode').decode('ascii'), - ts,sep='\t')