# HG changeset patch
# User Henry S. Thompson <ht@inf.ed.ac.uk>
# Date 1695888078 -3600
# Node ID f8d5e5355c4cb72ddad7d8c1ea7026e05d893200
# Parent  1d1bd22124c0143b3ea6d6e3333b80d5ea24e250
creating lmh package

diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh/lmh.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/lmh.py	Thu Sep 28 09:01:18 2023 +0100
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+'''Extract identifying info + LastModified header value for all entries
+   that have one
+
+   Usage: lmh.py CC-date segment filetype 3-digit-fileno'''
+
+import re,warc,sys,glob,codecs
+
+TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)  # group 1: value of the WARC-Target-URI header line
+DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)  # group 1: value of the WARC-Date header line
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)  # group 1: value of a Last-Modified header line
+
+DTAB=bytearray(range(256))  # identity table: translate() is used below only for its delete argument
+DDEL=b'TZ-:'  # bytes deleted from the WARC-Date value before it is written
+
+OUT=open(sys.stdout.fileno(),'wb')  # binary writer on stdout's file descriptor, so bytes are written as-is
+
+def showmeLMH(wtype,buf,part):  # callback passed to warc.warc at the bottom of this script; wtype is not used here
+  global URI, DATE, SEGMENT, FILETYPE, FILENO
+  if part==1:  # presumably the WARC header part of a record -- TODO confirm against warc.warc
+    if (m:=TUPAT.search(buf)):
+      URI=m[1]  # kept in a global so a later call can write it
+    else:
+      raise ValueError(b"No target URI in %s ??"%buf)
+    if (md:=DPAT.search(buf)):
+      DATE=md[1]
+    else:
+      raise ValueError(b"No date in %s ??"%buf)
+  else:  # any other part: search it for Last-Modified and write one output line
+    mm=LMPAT.search(buf)
+    OUT.write(URI)  # the URI is written even when there is no Last-Modified header
+    if mm:  # tab-separated: URI, date without T Z - :, segment, filetype, fileno, Last-Modified value
+      OUT.write(b'\t')
+      OUT.write(DATE.translate(DTAB,DDEL))
+      OUT.write(b'\t')
+      OUT.write(SEGMENT)
+      OUT.write(b'\t')
+      OUT.write(FILETYPE)
+      OUT.write(b'\t')
+      OUT.write(FILENO)
+      OUT.write(b'\t')
+      OUT.write(mm[1])
+    OUT.write(b'\n')
+
+(CCdate, segment, filetype, fileno) = sys.argv[1:]  # exactly four arguments, else the unpacking raises ValueError
+fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+  CCdate, segment, filetype, fileno)
+
+SEGMENT=codecs.encode(segment,'ascii')  # bytes copies of the arguments, written to OUT by showmeLMH
+FILETYPE=codecs.encode(filetype,'ascii')
+FILENO=codecs.encode(fileno,'ascii')
+
+warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)  # first glob match only; IndexError if nothing matches
+
diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh/merge_date.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/merge_date.py	Thu Sep 28 09:01:18 2023 +0100
@@ -0,0 +1,144 @@
+#!/usr/bin/python3
+'''Add timestamps from Last-Modified-dated (ks.tsv) files into
+   that year's index
+
+Usage: merge_date.py ksvstream cdx-dir outdir
+
+ksvstream consists of tab-separated key, CC date, url and Unix timestamp
+''' # '
+
+import sys, io, os, os.path, time, re
+from isal import igzip
+
+
+DEBUG = 0
+while sys.argv[1] == '-d':  # each leading -d raises the debug level by one and is removed from argv
+  sys.argv.pop(1)
+  DEBUG += 1  
+
+XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]  # template for input index files; the remaining %0.3d takes the file number
+NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]  # template for the uncompressed output files, same numbering
+
+RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
+b'(crawldiagnostics|robotstxt)/')
+SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'  # raw literal: \? is a regex escape, not a string escape
+                     b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
+                     b'=[^&]*)')
+ISESSION = re.compile(SESSION.pattern,flags=re.I)
+URL=re.compile(rb'\{"url": "([^"]*)"')  # group 1: the url value at the start of the JSON properties
+WARC=re.compile(rb' \{[^}]*"filename": "([^/]*/){4}warc/')  # index lines whose filename has warc/ after four path components
+
+# Above based on this from broken Java code:
+# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
+
+#print(sys.argv[3],NPATH,file=sys.stderr)
+
+os.makedirs(sys.argv[3], exist_ok=True)
+
+FN = 0  # number of the index file currently being read
+
+XCNT = WCNT = 0  # per-file counters: lines read / lines matching WARC; reported by nextLine at end of file
+DCNT = 0  # count of date lines read by nextDate
+
+XF = igzip.IGzipFile(filename=XPATH%0)  # current input index file
+NF = open(NN:=(NPATH%0),'wb')  # current output file; NN is its name
+
+def nextLine():
+  '''Return the next index line matching WARC, copying other lines to NF; None once no next index file exists'''
+  global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT
+  while True:
+    xl=XF.readline()
+    XCNT += 1
+    if xl == b'':
+      # need to move to next index file
+      FN += 1
+      XF.close()
+      NF.close()
+      print(NN, flush=True) # so we can compress it
+      print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True)
+      time.sleep(0.1) # so they flush?
+      XN=XPATH%FN
+      if not os.path.exists(XN):
+        return None  # XF and NF have already been closed at this point
+      XF = igzip.IGzipFile(filename=XN)
+      NF = open((NN:=NPATH%FN), 'wb')
+      xl = XF.readline()
+      WCNT = XCNT = 1
+    if WARC.search(xl):
+      WCNT += 1
+      return xl
+    else:
+      NF.write(xl)  # does not match WARC: copied to the output unchanged
+      if DEBUG:
+        sys.stderr.write("out_rc\n")
+
+
+def nextDate(df,dn):  # df: date stream read as bytes; dn: call-site tag shown in debug output
+  global DEBUG, DCNT, XCNT
+  dl = df.readline()
+  if dl == b'':
+    # date stream exhausted: empty bytes keys are falsy and stay the same type as the index keys
+    return b"", b"", b"", 0
+  if DEBUG:
+    sys.stderr.write("dl%s: %s\n"%(dn,dl))
+  dkey, ddate, durl, dtime = dl.split(b'\t')  # exactly four fields, else ValueError; dtime keeps its line ending
+  DCNT += 1
+  return dkey, ddate, durl, dtime
+
+with open(sys.argv[1], 'rb') as df:  # merge the date stream into the index lines returned by nextLine
+  DCNT = 0
+
+  dkey, ddate, durl, dtime = nextDate(df,1)
+
+  while (xl := nextLine()) is not None:
+    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)  # index line: key, date, then the JSON properties
+    m = URL.match(xprops)
+    if m:
+      xurl = m[1]
+    else:
+      raise ValueError("No url in %s"%xprops)
+    if DEBUG:
+      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
+                                            for xp in (xkey, xdate, xurl))))
+    if dkey==xkey and ddate==xdate and durl==xurl:
+      # Got it
+      NF.write(xkey)
+      NF.write(b' ')
+      NF.write(xdate)
+      NF.write(b' ')
+      NF.write(xprops[:-2])  # drop the last two bytes (presumably the closing brace and newline) so lastmod can be appended
+      NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))  # dtime minus its last three bytes must parse as an integer
+      if DEBUG:
+        sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
+                                             for xp in (xkey, xdate, xurl))))
+        sys.stderr.write(" %d\n"%int(dtime[:-3]))
+
+      dkey, ddate, durl, dtime = nextDate(df,2)
+      continue
+    else:
+      if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
+        # we've missed something, disaster looms
+        print("Fail2:"
+               "      xkey: %s\n"
+               "      dkey: %s\n"
+               "      xdate: %s\n"
+               "      ddate: %s\n"
+               "      xurl: %s\n"
+               "      durl: %s\n"
+               "FN: %s XCNT: %s DCNT: %s\n"
+               "xl: %s"%(xkey, dkey, xdate, ddate,
+                         xurl, durl,
+                         FN, XCNT, DCNT, xl),
+              file=sys.stderr)
+        # try to force recovery
+        dkey, ddate, durl, dtime = nextDate(df,3)
+        # deliberately no continue: the current index line must still be copied below, not dropped
+      # every non-matching line falls through to be written unchanged
+    NF.write(xl)
+    if DEBUG:
+      sys.stderr.write("out_nl\n")
diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh/sort_date.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/sort_date.py	Thu Sep 28 09:01:18 2023 +0100
@@ -0,0 +1,142 @@
+#!/usr/bin/python3
+r'''Process output of lmh_warc [new 3-column version]
+   Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
+'''
+
+# Assumes you have used fgrep $'\t' on input (as in the Usage above), for speed
+# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
+#  to fix a common 'bad' timestamp (~ .2% of inputs)
+
+import email.utils
+import sys
+from urllib.parse import urlsplit, quote, unquote
+import surt
+
+import re, codecs
+from itertools import chain
+
+WPAT = re.compile('(,www\\d*)+\\)')
+
+# Thanks to https://stackoverflow.com/a/8776871
+import locale
+from functools import cmp_to_key
+
+def percent_encode(ude):
+  # Codec error handler: replace each offending byte with % plus its uppercase hex value, resuming at ude.end
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+          ude.end)
+
+codecs.register_error('percent',percent_encode)  # selected below via errors='percent'
+
+def _u_esc(c):  # c is an integer code point
+  if c<65536:
+    return '\\u%04X'%c  # backslash-u plus four uppercase hex digits
+  else:
+    return '\\U%08X'%c  # backslash-U plus eight uppercase hex digits
+
+def java_unicode_encode(ude):
+  r'''like backslashreplace but use uppercase and \u00NN instead of \xnn'''
+  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
+          ude.end)
+
+codecs.register_error('java_unicode',java_unicode_encode)  # selected below via errors='java_unicode'
+
+# From RFC-3986:
+# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+#                / "*" / "+" / "," / ";" / "="
+# But # _is_ escaped in Java surt results
+#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
+
+# Note also that although quote already does _not_ quote - . / _ ~
+#  they are included below as that's what we find in surt.surt 0.3.1
+
+# Also, Java surt strips _all_ leading 'www\d*.',
+#  where python3 surt only strips the first one.
+
+# And Java strips so-called option session-ids, but python doesn't
+
+import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
+import surt.URLRegexTransformer
+
+ident = ''.join(chr(i) for i in range(256)).encode('latin-1')
+
+IDMAP=bytes.maketrans(ident,ident)
+
+# For removal of non-printing characters:
+#  Note, this is only a guess; the only example so far is DEL
+NONPRINT= ''.join(chr(i) for i in chain(range(9),
+                                      range(14,32),
+                                      [127] # DEL
+                                      )).encode('latin-1')
+
+def notDefaultCanon(hu,**options):  # passed as the canonicalizer argument to surt.surt in cdx_key
+  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
+    # Try to fix the incompatibility between Java and
+    #  Python surt handling of 'octal' numbers in numeric IPv4 addresses
+    #  and it should!  See "After this line:
+    #
+    # 15,225,107,143)" in .../azure/notes.txt
+    try:
+      bytestrs = hu.host.split(b'.')
+      hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)  # int() reads each part as decimal, so leading zeros are dropped
+    except ValueError:
+      pass  # a part that int() cannot parse: leave the host unchanged
+  if hu.query:
+    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)  # delete the NONPRINT bytes from the query
+  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
+
+# Hack this to reproduce the Java bug
+surt.URLRegexTransformer._RES_QUERY_SESSIONID = [  # overwrites the attribute on the imported surt module
+    re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
+    ]
+
+# Above based on this from broken Java code:
+# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
+
+def cdx_key(uristring):  # returns the lower-cased, re-quoted SURT form of uristring
+  _surt = quote(unquote(surt.surt(unquote(uristring),
+                                  canonicalizer=notDefaultCanon),
+                        errors='percent'),
+                safe='''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~\x7f''' # '
+                  ).lower()
+                # Wrt \x7f (DEL), see discussion in notes wrt
+                #   "biz,televida)" case
+                # It remains to be seen whether other non-printing bytes
+                #  will need to be treated as 'safe'
+  return WPAT.sub(')',_surt)  # collapse any run of ,www<digits> labels directly before the ')'
+
+def keyed(l):
+  uri, cc_stamp, dateTime = l.split('\t',2)  # fewer than two tabs raises ValueError here, outside the try
+  # Returns ((cdx key, cc_stamp, uri), epoch seconds), or None after logging a failure to stderr
+  try:
+    try:
+      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
+    except OverflowError:
+      epoch = 32535215999.0  # fixed fallback used when the date conversion overflows
+    return ((cdx_key(uri), cc_stamp, uri), epoch)
+  except (TypeError,IndexError,ValueError) as e:  # raised by the date parsing or by cdx_key
+    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
+    return
+
+fstr = sys.argv[1]
+
+with open(fstr,"r") as ff:
+  # crucial that the following is done _after_ the file is opened
+  #  with the default (utf-8) locale!
+  locale.setlocale(locale.LC_ALL, "C")
+  ctk=cmp_to_key(locale.strcoll)  # NOTE(review): ctk is not used below; sorted() compares the key tuples directly
+  for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),  # lines for which keyed returned None are skipped
+                   key=lambda x:x[0]):
+    print(key[0],key[1],
+          key[2].encode('ascii',errors='java_unicode').decode('ascii'),  # non-ASCII url characters become \uXXXX / \UXXXXXXXX
+          ts,sep='\t')
diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/lmh_warc.py
--- a/lib/python/cc/lmh_warc.py	Thu Sep 28 08:46:01 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-'''Extract identifying info + LastModified header value for all entries
-   that have one
-
-   Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''
-
-import re,warc,sys,glob,codecs
-
-TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
-DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
-LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
-
-DTAB=bytearray(range(256))
-DDEL=b'TZ-:'
-
-OUT=open(sys.stdout.fileno(),'wb')
-
-def showmeLMH(wtype,buf,part):
-  global URI, DATE, SEGMENT, FILETYPE, FILENO
-  if part==1:
-    if (m:=TUPAT.search(buf)):
-      URI=m[1]
-    else:
-      raise ValueError(b"No target URI in %s ??"%buf)
-    if (md:=DPAT.search(buf)):
-      DATE=md[1]
-    else:
-      raise ValueError(b"No date in %s ??"%buf)
-  else:
-    mm=LMPAT.search(buf)
-    OUT.write(URI)
-    if mm:
-      OUT.write(b'\t')
-      OUT.write(DATE.translate(DTAB,DDEL))
-      OUT.write(b'\t')
-      OUT.write(SEGMENT)
-      OUT.write(b'\t')
-      OUT.write(FILETYPE)
-      OUT.write(b'\t')
-      OUT.write(FILENO)
-      OUT.write(b'\t')
-      OUT.write(mm[1])
-    OUT.write(b'\n')
-
-(CCdate, segment, filetype, fileno) = sys.argv[1:]
-fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
-  CCdate, segment, filetype, fileno)
-
-SEGMENT=codecs.encode(segment,'ascii')
-FILETYPE=codecs.encode(filetype,'ascii')
-FILENO=codecs.encode(fileno,'ascii')
-
-warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
-
diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/merge_date.py
--- a/lib/python/cc/merge_date.py	Thu Sep 28 08:46:01 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,144 +0,0 @@
-#!/usr/bin/python3
-'''Add timestamps from Last-Modified-dated (ks.tsv) files into
-   that year's index
-
-Usage: merge_date.py ksvstream cdx-dir outdir
-
-ksvstream consists of tab-separated key, CC date, url and Unix timestamp
-''' # '
-
-import sys, io, os, os.path, time, re
-from isal import igzip
-
-
-DEBUG = 0
-while sys.argv[1] == '-d':
-  sys.argv.pop(1)
-  DEBUG += 1  
-
-XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
-NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
-
-RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
-b'(crawldiagnostics|robotstxt)/')
-SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
-                     b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
-                     b'=[^&]*)')
-ISESSION = re.compile(SESSION.pattern,flags=re.I)
-URL=re.compile(b'\{"url": "([^"]*)"')
-WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
-
-# Above based on this from broken Java code:
-# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
-#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
-#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
-#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
-#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
-#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
-
-#print(sys.argv[3],NPATH,file=sys.stderr)
-
-os.makedirs(sys.argv[3], exist_ok=True)
-
-FN = 0
-
-XCNT = WCNT = 0
-DCNT = 0
-
-XF = igzip.IGzipFile(filename=XPATH%0)
-NF = open(NN:=(NPATH%0),'wb')
-
-def nextLine():
-  '''Move on to next index file if current has run out'''
-  global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT
-  while True:
-    xl=XF.readline()
-    XCNT += 1
-    if xl == b'':
-      # need to move to next index file
-      FN += 1
-      XF.close()
-      NF.close()
-      print(NN, flush=True) # so we can compress it
-      print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True)
-      time.sleep(0.1) # so they flush?
-      XN=XPATH%FN
-      if not os.path.exists(XN):
-        return None
-      XF = igzip.IGzipFile(filename=XN)
-      NF = open((NN:=NPATH%FN), 'wb')
-      xl = XF.readline()
-      WCNT = XCNT = 1
-    if WARC.search(xl):
-      WCNT += 1
-      return xl
-    else:
-      NF.write(xl)
-      if DEBUG:
-        sys.stderr.write("out_rc\n")
-
-
-def nextDate(df,dn):
-  global DEBUG, DCNT, XCNT
-  dl = df.readline()
-  if dl == b'':
-    # write out the last of the last index file, if any
-    return "", "", "", 0
-  if DEBUG:
-    sys.stderr.write("dl%s: %s\n"%(dn,dl))
-  dkey, ddate, durl, dtime = dl.split(b'\t')
-  DCNT += 1
-  return dkey, ddate, durl, dtime
-
-with open(sys.argv[1], 'rb') as df:
-  DCNT = 0
-
-  dkey, ddate, durl, dtime = nextDate(df,1)
-
-  while (xl := nextLine()) is not None:
-    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
-    m = URL.match(xprops)
-    if m:
-      xurl = m[1]
-    else:
-      raise ValueError("No url in %s"%xprops)
-    if DEBUG:
-      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
-                                            for xp in (xkey, xdate, xurl))))
-    if dkey==xkey and ddate==xdate and durl==xurl:
-      # Got it
-      NF.write(xkey)
-      NF.write(b' ')
-      NF.write(xdate)
-      NF.write(b' ')
-      NF.write(xprops[:-2])
-      NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
-      if DEBUG:
-        sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
-                                             for xp in (xkey, xdate, xurl))))
-        sys.stderr.write(" %d\n"%int(dtime[:-3]))
-
-      dkey, ddate, durl, dtime = nextDate(df,2)
-      continue
-    else:
-      if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
-        # we've missed something, disaster looms
-        print("Fail2:"
-               "      xkey: %s\n"
-               "      dkey: %s\n"
-               "      xdate: %s\n"
-               "      ddate: %s\n"
-               "      xurl: %s\n"
-               "      durl: %s\n"
-               "FN: %s XCNT: %s DCNT: %s\n"
-               "xl: %s"%(xkey, dkey, xdate, ddate,
-                         xurl, durl,
-                         FN, XCNT, DCNT, xl),
-              file=sys.stderr)
-        # try to force recovery
-        dkey, ddate, durl, dtime = nextDate(df,3)
-        continue
-      # else fall through to write
-    NF.write(xl)
-    if DEBUG:
-      sys.stderr.write("out_nl\n")
diff -r 1d1bd22124c0 -r f8d5e5355c4c lib/python/cc/sort_date.py
--- a/lib/python/cc/sort_date.py	Thu Sep 28 08:46:01 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
-#!/usr/bin/python3
-'''Process output of lmh_warc [new 3-column version]
-   Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
-'''
-
-# Assumes you have used grep -v $'\t' on input for speed
-# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
-#  to fix a common 'bad' timestamp (~ .2% of inputs)
-
-import email.utils
-import sys
-from urllib.parse import urlsplit, quote, unquote
-import surt
-
-import re, codecs
-from itertools import chain
-
-WPAT = re.compile('(,www\\d*)+\\)')
-
-# Thanks to https://stackoverflow.com/a/8776871
-import locale
-from functools import cmp_to_key
-
-def percent_encode(ude):
-  #print(ude.object,ude.object[ude.start:ude.end])
-  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
-          ude.end)
-
-codecs.register_error('percent',percent_encode)
-
-def _u_esc(c):
-  if c<65536:
-    return '\\u%04X'%c
-  else:
-    return '\\U%08X'%c
-
-def java_unicode_encode(ude):
-  '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn'''
-  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
-          ude.end)
-
-codecs.register_error('java_unicode',java_unicode_encode)
-
-# From RFC-3986:
-# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
-# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
-#                / "*" / "+" / "," / ";" / "="
-# But # _is_ escaped in Java surt results
-#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
-
-# Note also that although quote already does _not_ quote - . / _ ~
-#  they are included below as that's what we find in surt.surt 0.3.1
-
-# Also, Java surt strips _all_ leading 'www\d*.',
-#  where python3 surt only strips the first one.
-
-# And Java strips so-called option session-ids, but python doesn't
-
-import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
-import surt.URLRegexTransformer
-
-ident = ''.join(chr(i) for i in range(256)).encode('latin-1')
-
-IDMAP=bytes.maketrans(ident,ident)
-
-# For removal of non-printing characters:
-#  Note, this is only a guess, only example so are is DEL
-NONPRINT= ''.join(chr(i) for i in chain(range(9),
-                                      range(14,32),
-                                      [127] # DEL
-                                      )).encode('latin-1')
-
-def notDefaultCanon(hu,**options):
-  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
-    # Try to fix the incompatibility between Java and 
-    #  Python surt handling of 'octal' numbers in numeric IPv4 addresses
-    #  and it should!  See "After this line:
-    # 
-    # 15,225,107,143)" in .../azure/notes.txt
-    try:
-      bytestrs = hu.host.split(b'.')
-      hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
-    except ValueError:
-      pass
-  if hu.query:
-    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)
-  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
-
-# Hack this to reproduce the Java bug
-surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
-    re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
-    ]
-
-# Above based on this from broken Java code:
-# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
-#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
-#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
-#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
-#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
-#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
-
-def cdx_key(uristring):
-  _surt = quote(unquote(surt.surt(unquote(uristring),
-                                  canonicalizer=notDefaultCanon),
-                        errors='percent'),
-                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # '
-                  ).lower()
-                # Wrt \x7f (DEL), see discussion in notes wrt
-                #   "biz,televida)" case
-                # It remains to be seen whether other non-printing bytes
-                #  will need to be treated as 'safe'
-  return WPAT.sub(')',_surt)
-
-def keyed(l):
-  uri, cc_stamp, dateTime = l.split('\t',2)
-  #print('ul',uri,file=sys.stderr)
-  try:
-    try:
-      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
-    except OverflowError:
-      epoch = 32535215999.0
-    return ((cdx_key(uri), cc_stamp, uri), epoch)
-  except (TypeError,IndexError,ValueError) as e:
-    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
-    return
-
-fstr = sys.argv[1]
-
-with open(fstr,"r") as ff:
-  # crucial that the following is done _after_ the file is opened
-  #  with the default (utf-8) locale!
-  locale.setlocale(locale.LC_ALL, "C")
-  ctk=cmp_to_key(locale.strcoll)
-  for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
-                   key=lambda x:x[0]):
-    print(key[0],key[1],
-          key[2].encode('ascii',errors='java_unicode').decode('ascii'),
-          ts,sep='\t')