Mercurial > hg > cc > cirrus_work
view bin/sort_date.py @ 118:9d14e7c32737
replicate two extremely-corner cases of the way
Java produces surts for URIs containing escaped DEL chars
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 27 Sep 2023 17:29:09 +0100 |
parents | 827eadc72122 |
children |
line wrap: on
line source
#!/usr/bin/python3
r'''Process output of lmh_warc [new 3-column version]

Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")

Reads the file named by the first command-line argument.  Each input line
is expected to hold three tab-separated fields: uri, cc_stamp, dateTime.
Writes to stdout one tab-separated line per parseable input line, sorted
by (cdx key, cc_stamp, uri):

    cdx_key  cc_stamp  uri (non-ASCII chars as \uXXXX)  epoch-seconds

Lines whose date or URI cannot be processed are reported on stderr
and skipped.
'''
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)

import codecs
import email.utils
# Thanks to https://stackoverflow.com/a/8776871
import locale
import re
import sys
from functools import cmp_to_key
from itertools import chain
from urllib.parse import urlsplit, quote, unquote

import surt
import surt.DefaultIAURLCanonicalizer
import surt.GoogleURLCanonicalizer
import surt.URLRegexTransformer

# One or more ',www<digits>' components immediately before the ')' that
#  closes the host part of a surt
WPAT = re.compile(r'(,www\d*)+\)')


def percent_encode(ude):
    '''codecs error handler: render each offending byte as %XX.

    ude is the decode-error exception passed in by the codec machinery;
    ude.object[ude.start:ude.end] is the run of bytes that failed.
    Returns (replacement text, position at which to resume), as required
    of handlers passed to codecs.register_error.
    '''
    bad_bytes = ude.object[ude.start:ude.end]
    # %02X so that a percent-escape always has two hex digits
    return (''.join('%%%02X' % c for c in bad_bytes), ude.end)


codecs.register_error('percent', percent_encode)


def _u_esc(c):
    '''Return the \\uXXXX (or \\UXXXXXXXX above the BMP) escape for code point c.'''
    if c < 65536:
        return '\\u%04X' % c
    return '\\U%08X' % c


def java_unicode_encode(ude):
    r'''Like backslashreplace but use uppercase and \u00NN instead of \xnn.

    codecs error handler for encoding: ude.object[ude.start:ude.end] is
    the run of characters that could not be encoded.  Returns
    (replacement text, position at which to resume).
    '''
    bad_chars = ude.object[ude.start:ude.end]
    return (''.join(_u_esc(ord(c)) for c in bad_chars), ude.end)


codecs.register_error('java_unicode', java_unicode_encode)

# From RFC-3986:
# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
#                  / "*" / "+" / "," / ";" / "="
# But # _is_ escaped in Java surt results
#  and additionally " \ : < = > ? \ ^ _ ` { | } are not

# Note also that although quote already does _not_ quote - . / _ ~
#  they are included below as that's what we find in surt.surt 0.3.1

# Also, Java surt strips _all_ leading 'www\d*.',
#  where python3 surt only strips the first one.

# And Java strips so-called optional session-ids, but python doesn't

# Characters left unescaped by quote() in cdx_key.  Note the explicit
#  backslash ('\\') between '[' and ']'.
# Wrt \x7f (DEL), see discussion in notes wrt
#   "biz,televida)" case
# It remains to be seen whether other non-printing bytes
#  will need to be treated as 'safe'
_CDX_SAFE = '''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~\x7f'''

# Identity translation table over all 256 byte values; used only so that
#  bytes.translate can be asked to delete NONPRINT
ident = ''.join(chr(i) for i in range(256)).encode('latin-1')

IDMAP = bytes.maketrans(ident, ident)

# For removal of non-printing characters:
# Note, this is only a guess, only example so far is DEL
NONPRINT = ''.join(chr(i) for i in chain(range(9),
                                         range(14, 32),
                                         [127]  # DEL
                                         )).encode('latin-1')


def notDefaultCanon(hu, **options):
    '''Canonicalizer for surt.surt that pre-adjusts hu before delegating.

    hu is the URL object handed over by surt; its host and query
    attributes are treated as bytes here.  Two adjustments are made
    in place, then surt.DefaultIAURLCanonicalizer.canonicalize is applied:

    * a host matching surt's DECIMAL_IP pattern has each dotted component
      re-rendered via int(), which drops any leading zeros;
    * bytes listed in NONPRINT are deleted from the query.
    '''
    # NOTE(review): if hu.host can be None the match below raises TypeError,
    #  which keyed() catches and reports -- confirm that is intended
    if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
        # Try to fix the incompatibility between Java and
        #  Python surt handling of 'octal' numbers in numeric IPv4 addresses
        #  and it should!  See "After this line:
        #  # 15,225,107,143)" in .../azure/notes.txt
        try:
            bytestrs = hu.host.split(b'.')
            hu.host = b'.'.join(b'%d' % int(bs) for bs in bytestrs)
        except ValueError:
            # Best effort: leave the host untouched if a component
            #  is not a plain integer
            pass
    if hu.query:
        hu.query = hu.query.translate(IDMAP, delete=NONPRINT)
    return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)


# Hack this to reproduce the Java bug
surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
    re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
    re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
    re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
    re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
    re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
    ]

# Above based on this from broken Java code:
# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",


def cdx_key(uristring):
    '''Return the sort key (lower-cased, re-quoted surt) for uristring.

    The URI is unquoted, passed through surt.surt with notDefaultCanon,
    unquoted again (undecodable bytes become %XX via the 'percent' error
    handler), re-quoted leaving _CDX_SAFE characters alone, and
    lower-cased.  Finally every run of ',www<digits>' components just
    before the host-closing ')' is removed.
    '''
    canonical = surt.surt(unquote(uristring), canonicalizer=notDefaultCanon)
    _surt = quote(unquote(canonical, errors='percent'),
                  safe=_CDX_SAFE).lower()
    return WPAT.sub(')', _surt)


def keyed(l):
    '''Turn one input line into ((cdx_key, cc_stamp, uri), epoch), or None.

    l is a line of the form uri TAB cc_stamp TAB dateTime.  dateTime is
    parsed with email.utils.parsedate_to_datetime; a date too large to
    convert gets the fixed epoch 32535215999.0.  On failure the offending
    text and the reason are written tab-separated to stderr and None is
    returned so that the caller can skip the line.
    '''
    fields = l.split('\t', 2)
    if len(fields) != 3:
        # Previously this aborted the whole run with an uncaught ValueError
        print(l.rstrip(), 'expected 3 tab-separated fields',
              sep='\t', file=sys.stderr)
        return None
    uri, cc_stamp, dateTime = fields
    #print('ul',uri,file=sys.stderr)
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            epoch = 32535215999.0
        return ((cdx_key(uri), cc_stamp, uri), epoch)
    except (TypeError, IndexError, ValueError) as e:
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None


def main():
    '''Sort the lines of the file named by sys.argv[1] and print them.'''
    if len(sys.argv) < 2:
        sys.exit('Usage: %s FILE' % sys.argv[0])
    fstr = sys.argv[1]

    with open(fstr, "r") as ff:
        # crucial that the following is done _after_ the file is opened
        #  with the default (utf-8) locale!
        locale.setlocale(locale.LC_ALL, "C")
        # Ordering is Python's native tuple/str comparison on the
        #  (cdx_key, cc_stamp, uri) triple; no strcoll-based key is applied
        records = (kk for line in ff if (kk := keyed(line)) is not None)
        for key, ts in sorted(records, key=lambda x: x[0]):
            print(key[0], key[1],
                  key[2].encode('ascii', errors='java_unicode').decode('ascii'),
                  ts, sep='\t')


if __name__ == '__main__':
    main()