Mercurial > hg > cc > cirrus_work

#!/usr/bin/python3
r'''Process output of lmh_warc [new 3-column version]
   Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
'''

# Assumes you have used fgrep $'\t' on input (keeping only tab-separated lines) for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)

import email.utils
import sys
from urllib.parse import urlsplit, quote, unquote
import surt

import re, codecs
from itertools import chain

# Matches one or more ',www<digits>' components immediately before ')'
WPAT = re.compile(r'(,www\d*)+\)')

# Thanks to https://stackoverflow.com/a/8776871
import locale
from functools import cmp_to_key

def percent_encode(ude):
  '''Codec error handler: percent-encode the bytes that failed to decode.

  ude is the UnicodeDecodeError passed in by the codec machinery; its
  .object is the bytes being decoded and [.start:.end] is the offending
  span.  Returns the (replacement, resume position) pair that the
  codecs error-handler protocol requires.

  Every byte is rendered as exactly two uppercase hex digits (%0A, not
  %A), as percent-encoding requires.
  '''
  #print(ude.object,ude.object[ude.start:ude.end])
  return (''.join('%%%02X'%c for c in ude.object[ude.start:ude.end]),
          ude.end)

codecs.register_error('percent',percent_encode)

def _u_esc(c):
  r'''Return the escape for code point c (an int):
  \uXXXX (4 uppercase hex digits) below 65536, \UXXXXXXXX (8) otherwise.'''
  if c<65536:
    return '\\u%04X'%c
  else:
    return '\\U%08X'%c

def java_unicode_encode(ude):
  r'''Codec error handler: like backslashreplace, but with uppercase hex
  digits and \u00NN instead of \xnn.

  ude is the UnicodeEncodeError passed in by the codec machinery; the
  characters in ude.object[ude.start:ude.end] are each replaced by their
  escape.  Returns the (replacement, resume position) pair that the
  codecs error-handler protocol requires.
  '''
  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
          ude.end)

codecs.register_error('java_unicode',java_unicode_encode)

# From RFC-3986:
# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
#                / "*" / "+" / "," / ";" / "="
# But # _is_ escaped in Java surt results
#  and additionally " \ : < = > ? \ ^  _ ` { | } are not

# Note also that although quote already does _not_ quote - . / _ ~
#  they are included below as that's what we find in surt.surt 0.3.1

# Also, Java surt strips _all_ leading 'www\d*.',
#  where python3 surt only strips the first one.

# And Java strips so-called option session-ids, but python doesn't

import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
import surt.URLRegexTransformer

# All 256 byte values in order; used to build an identity translation table
ident = bytes(range(256))

# Identity table, so that bytes.translate(IDMAP, delete=...) only deletes
IDMAP=bytes.maketrans(ident,ident)

# For removal of non-printing characters:
#  Note, this is only a guess; the only example so far is DEL
NONPRINT = bytes(chain(range(9),      # 0x00-0x08
                       range(14,32),  # 0x0E-0x1F
                       (127,)))       # DEL

def notDefaultCanon(hu,**options):
  '''Canonicalizer for surt.surt: pre-process hu, then delegate to
  surt.DefaultIAURLCanonicalizer.canonicalize.

  hu is the URL object being canonicalized; its .host and .query are
  handled as bytes here.  options are passed through unchanged.
  '''
  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
    # Work around the incompatibility between Java and Python surt
    #  handling of 'octal' numbers in numeric IPv4 addresses: re-render
    #  each dotted component as a plain decimal integer.
    #  See "After this line:
    #
    # 15,225,107,143)" in .../azure/notes.txt
    try:
      octets = [b'%d'%int(part) for part in hu.host.split(b'.')]
    except ValueError:
      # Some component is not an integer: leave the host untouched
      pass
    else:
      hu.host = b'.'.join(octets)
  if hu.query:
    # Drop the bytes listed in NONPRINT; everything else maps to itself
    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)
  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)

# Hack: replace surt's session-id patterns so as to reproduce the Java bug
#  All patterns are compiled case-insensitively
surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
    re.compile(pattern, re.I)
    for pattern in (
        b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$",
        b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$",
        b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$",
        b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
        b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
    )
    ]

# Above based on this from broken Java code:
# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

def cdx_key(uristring):
  '''Return the CDX sort key for uristring.

  The URI is percent-decoded, run through surt.surt with notDefaultCanon
  as canonicalizer, percent-decoded again (undecodable bytes are
  re-emitted as %XX by the 'percent' error handler), re-quoted with the
  safe set below and lower-cased.  Finally every run of ',www<digits>'
  components immediately before the ')' is removed.
  '''
  canon = surt.surt(unquote(uristring),
                    canonicalizer=notDefaultCanon)
  # Wrt \x7f (DEL) in the safe set, see discussion in notes wrt
  #   "biz,televida)" case
  # It remains to be seen whether other non-printing bytes
  #  will need to be treated as 'safe'
  # The backslash is written as an explicit escape (\\) so the literal
  #  contains no invalid escape sequence
  _surt = quote(unquote(canon,
                        errors='percent'),
                safe='''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~\x7f''' # '
                  ).lower()
  return WPAT.sub(')',_surt)

def keyed(l):
  '''Turn one tab-separated input line into a sortable record.

  l is expected to hold three tab-separated fields: URI, a stamp that
  is passed through unchanged, and an RFC-2822-style date string.

  Returns ((cdx_key(uri), cc_stamp, uri), epoch), where epoch is the
  date as a POSIX timestamp, or None if the line cannot be processed.
  In that case the offending text and the error are written to stderr,
  tab-separated.
  '''
  try:
    uri, cc_stamp, dateTime = l.split('\t',2)
  except ValueError as e:
    # Fewer than three fields: report and skip rather than abort the run
    print(l.rstrip(),e,sep='\t',file=sys.stderr)
    return None
  #print('ul',uri,file=sys.stderr)
  try:
    try:
      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
    except OverflowError:
      # Date out of range: substitute 3000-12-31 23:59:59 UTC
      epoch = 32535215999.0
    return ((cdx_key(uri), cc_stamp, uri), epoch)
  except (TypeError,IndexError,ValueError) as e:
    # Unparseable date, or a failure while computing the key
    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
    return None

# Input file name is the first command-line argument
fstr = sys.argv[1]

with open(fstr,"r") as ff:
  # crucial that the locale is switched only _after_ the file is opened
  #  with the default (utf-8) locale!
  locale.setlocale(locale.LC_ALL, "C")
  # NOTE(review): ctk is not referenced by the sort below, which compares
  #  the key tuples directly
  ctk=cmp_to_key(locale.strcoll)
  # Lines that keyed() rejects (None) are dropped
  rows = [kk for l in ff if (kk:=keyed(l)) is not None]
  rows.sort(key=lambda x:x[0])
  for key, ts in rows:
    cdx, cc_stamp, uri = key
    # Non-ASCII characters in the original URI are emitted as
    #  \uXXXX / \UXXXXXXXX escapes
    print(cdx,cc_stamp,
          uri.encode('ascii',errors='java_unicode').decode('ascii'),
          ts,sep='\t')
author	Henry S. Thompson <ht@inf.ed.ac.uk>
date	Wed, 27 Sep 2023 17:29:09 +0100
parents	827eadc72122
children