+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,31 @@
+'''Turn a merge_nnn.log file into a cluster.idx file
+   We cheat and use the old cluster.idx to save having to read
+   all the cdx-....gz files'''
+import sys
+with open('cluster.idx','r') as oidx, open('new.idx','w') as nidx:
+  i=-1
+  curpos=0
+  target="cdx-00%03d.gz"%i
+  log=open("/dev/null",'r') # embarassing hack
+  for ol in oidx:
+    (surt, datestamp, file, offset, length, cnt) = ol.split()
+    if file!=target:
+      i+=1
+      target="cdx-00%03d.gz"%i
+      log.close()
+      curpos=0
+      log=open('merge_%d.log'%(i+1),'r')
+      hdr=log.readline()
+      (j,f) = hdr.split()
+      sys.stderr.write(hdr)
+      if int(j)!=i+1:
+        raise ValueError("wrong file: i=%s, j=%s"%(i,j))
+    nl=log.readline()
+    if not nl:
+      sys.stderr.write('quiting early: %s\n'%i)
+      exit(1)
+    nlen=int(nl)
+    nidx.write("%s %s\t%s\t%s\t%s\t%s\n"%(surt, datestamp, file, curpos, nlen, cnt))
+    curpos+=nlen
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+import warc,sys
+if (debug:=(sys.argv[1]=='-d')):
+  sys.argv.pop(1)
+def countme(wtype,buf,part):
+  if debug:
+    breakpoint()
+  OUT.write(b"%d\n"%len(buf))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,31 @@
+from percent_encode import percent_encode
+from urllib.parse import quote, unquote
+import sys
+# From RFC-3986:
+# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+#                / "*" / "+" / "," / ";" / "="
+# But # _is_ escaped in Java surt results
+#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
+# Note also that quote already does _not_ quote - . / _ ~
+# Also, Java surt strips _all_ leading 'www.',
+#  where python3 surt only strips the first one.
+with open(sys.argv[1],"r") as f:
+  for l in f:
+    while l.endswith(',www',0,ploc:=l.index(')')):
+      l=l[:ploc-4]+l[ploc:]
+    if '%' in l:
+      (key,wt,ts)=l.split('\t')
+      sys.stdout.write(quote(unquote(key,errors='percent'),
+                             safe='!"$&\'()*+,:;<=>?@[\\]^`{|}').lower())
+      sys.stdout.write('\t')
+      sys.stdout.write(wt)
+      sys.stdout.write('\t')
+      sys.stdout.write(ts)
+    else:
+      sys.stdout.write(l)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+'''Extract response records from Common Crawl WARC-format files
+given length, offset and filename triples.
+Input one triple on command line, or
+triples from stdin as tab-delimited lines
+or complete cdx index lines.
+In all cases by 'filename' is meant crawlid/segmentid/type/filename
+Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
+import sys, argparse, regex, os, shutil, io, gzip, time, shlex
+from isal import igzip
+from subprocess import Popen, PIPE
+#import asyncio
+HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
+class HackFormat(argparse.RawDescriptionHelpFormatter):
+  def format_help(self):
+    FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
+    return HACK_USAGE.sub('\n             [ ( -x | length offset filename ) ]',
+                          FOO)
+def process(options,buf,filename,offset,length,whole):
+    try:
+      process0(options,buf,filename,offset,length,whole)
+    except Exception as e:
+      if options.debug:
+        import traceback
+        traceback.print_exc(file=sys.stderr)
+      else:
+        print("Process fail: %s, input line:\n %s"%(e,l),
+              file=sys.stderr,end='')
+      exit(3)
+def process0(options,buf,filename,offset,length,whole):
+  if
+    (tf,TMPFILENAME)=tempfile.mkstemp()
+    TMPFILE=open(tf,mode='wb')
+  if options.cmd and not options.process:
+    launch(options.cmd)
+  process1(options,buf,filename,offset,length,whole)
+  if
+    TMPFILE.close()
+    if options.cmd:
+      _output_subproc(bytes(TMPFILENAME,'utf-8'))
+      _output_subproc(b"\n")
+    else:
+      BINOUT.write(bytes(TMPFILENAME,'utf-8'))
+      BINOUT.write(b"\n")
+  if options.cmd:
+    if not options.process:
+      windup(filename,options,length)
+    if
+      os.unlink(TMPFILENAME)
+  elif
+    print("%s will need to be deleted"%TMPFILENAME,file=sys.stderr)
+def launch(cmd):
+  global CMD_PROC, BINOUT
+  CMD_PROC=Popen(shlex.split(cmd),stdin=PIPE,bufsize=0)
+def windup(length,offset,filename):
+  # Wind up subproc
+  BINOUT.close()
+  if CMD_PROC.wait()!=0:    # could/should be async?
+    print("subproc of %s:%s:%s failed with %s"%(length,offset,filename,
+                                                CMD_PROC.returncode),
+          file=sys.stderr)
+def _output_tmpfile(buf):
+  '''Output function: write buf to the module-level TMPFILE.'''
+  TMPFILE.write(buf)
+def _output_stdout(buf):
+  '''Output function: write buf to the module-level BINOUT.'''
+  BINOUT.write(buf)
+def _output_subproc(buf):
+  toWrite=len(buf)
+  while toWrite>0:
+    toWrite -= BINOUT.write(buf)    
+def process1(options,buf,filename,offset,length,whole):
+  root=options.root
+  rfn=root+filename
+  if root!="/beegfs/common_crawl":
+    # Support using ramdisk or other local disk as a faster cached
+    if not os.path.exists(rfn):
+      if not os.path.exists(os.path.dirname(rfn)):
+        os.makedirs(os.path.dirname(rfn))
+      with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \
+              io.FileIO(rfn,'w') as outfile:
+        #shutil.copyfileobj(infile,outfile,128*1024*1024)
+        while True:
+          l=infile.readinto(buf)
+          if l==0:
+            break
+          outfile.write(memoryview(buf)[:l])
+  file=open(rfn,'rb',0)
+  bv=memoryview(buf)[:length]
+  nb=file.readinto(bv)
+  file.close()
+  if nb!=length:
+    raise ValueError("Chunk read losing: %s, got %s expected %s at %s"%(,
+                                                                  nb,length,offset))
+  if whole and options.zipped:
+    _output(bv)
+    return
+  gzip_chunk = io.BytesIO(bv)
+  uv=memoryview(buf)[length:]
+  with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
+    ll=0
+    while True:
+      l=gzip_fin.readinto(uv)
+      if not l:
+        break
+      ll+=l
+    cb=memoryview(uv)[:ll]
+    if whole:
+      _output(cb)
+      return
+  # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
+  state=0
+  tr=None # Was this record truncated?
+  bl=None # for HTTP Content-Length for the length of the body?
+  # Could we make this faster by working purely within the cb memoryview?
+  # It would be messy, but avoid copying huge amounts
+  # The outer loop would just be something like
+  #   clbv=memoryview(bytearray(b"Content-Length: "))
+  #   i=s=0
+  #   while i<ll:
+  #     if cb[i]==10: # need to handle \r\n
+  #       L=cb[s:i]
+  #       s=i=i+1
+  #       if L[:16]==clbv:
+  #         wl=int(L[16:])
+  #     else:
+  #       i+=1
+  #
+  with io.BytesIO(cb) as clear_text:
+    for L in clear_text:
+      if state==0:
+        # WARC header
+        if L.startswith(b"Content-Length: "):
+          wl=int(L[16:].rstrip())
+        elif L.startswith(b"WARC-Truncated: "):
+          tr=L[16:].rstrip()
+          tr="EMPTY" if tr=="" else tr
+        elif L==b"" or L.startswith(b"\r"): # for idempotency
+          # Blank line, WARC header is finished
+          if not (options.headers or options.body):
+            return
+          state=1
+          # Note we preserve the empty line
+        if options.warc:
+          _output(L)
+        continue
+      if state==1:
+        # HTTP header
+        wl -= len(L)
+        if not (L==b"" or L.startswith(b"\r")):
+          # Non-blank, it's a header
+          (h,_,v)=L.partition(b": ")
+          if bl is None and (h==b"Content-Length"):
+            bl=int(v)
+          if options.headers:
+            if isinstance(options.headers,dict):
+              if h in options.headers:
+                options.headers[h]=v
+            else:
+              _output(L)
+        else:
+          # Blank line, HTTP header is finished
+          if isinstance(options.headers,dict):
+            _output(bytes(str(options.headers),'utf-8'))
+          if not options.body:
+            return
+          if options.headers:
+            _output(L)
+          state=2
+          # The above is just for sanity, because we do _not_
+          #  continue with the outer loop,
+          #  since we can now block-output the entire rest of the
+          #  input buffer.
+          if bl is not None:
+            if bl!=wl:
+              print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
+                    (length,offset,filename,wl,bl,tr),file=sys.stderr)
+          # HTTP body
+          balance=clear_text.tell()
+          #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
+          # Output whatever is left
+          _output(cb[balance:balance+wl])
+          return
+def main():
+  global _output,TMPFILE,TMPFILENAME,tempfile
+  parser = argparse.ArgumentParser(
+    description='''Extract records from warc files given length, offset and file triples.
+  Input one triple on command line, or
+  triples from stdin as tab-delimited lines
+  or complete cdx index lines.
+  In all cases by 'filename' is meant crawlid/segmentid/type/filename''',
+    epilog='''Note that if no output flag(s) is/are given,
+  the whole WARC record will be output, more efficiently than
+  would be the case if all three flags were given.''',
+    add_help=False,
+    conflict_handler='resolve',
+    formatter_class=HackFormat
+    )
+  fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s')
+  parser.add_argument('--help',help='Show help',action='help')
+  parser.add_argument('-d','--debug',help='Debug output',action='store_true')
+  parser.add_argument('-w','--warc',help='output WARC headers',
+                      action='store_true')
+  parser.add_argument('-h','--headers',help='process HTTP headers: collect into dict with named values (,-separated) if arg present, else output',
+                      nargs='?',default=None,const=True)
+  parser.add_argument('-b','--body',help='output HTTP body',
+                      action='store_true')
+  parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
+  parser.add_argument('-p','--process',help='with -c, launches CMD only once',
+                      action='store_true')
+  parser.add_argument('-m','--module.function',help='module.function to call with a stream'),
+  parser.add_argument('-s','--save',action='store_true',
+                      help="write to a temporary file and output the name")
+  parser.add_argument('-f','--fpath',
+                      help=fphelp,
+                      default=FPAT)
+  parser.add_argument('-r','--root',nargs='?',
+                  help='File path root, create a copy there if necessary',
+                  default='/beegfs/common_crawl'),
+  parser.add_argument('-z','--zipped',
+                      help="output raw gzipped record, ignored if any of -bhw supplied",
+                      action='store_true')
+  sg=parser.add_mutually_exclusive_group()
+  sg.add_argument('-x','--index',
+                      help='take lines of triples from a cdx index file as input',
+                      action='store_true')
+  sg.add_argument('length',type=int,
+                  help='length in bytes of gzipped record',
+                  nargs='?')
+  parser.add_argument('offset',type=int,
+                      help='start position in bytes of gzipped record',
+                      nargs='?')
+  parser.add_argument('filename',
+                      help='pathname of gzipped Common Crawl WARC-format file',
+                      nargs='?')
+  # Hack the order of optional and positional in the help output
+  parser._action_groups.sort(key=lambda g:g.title)
+  #parser.print_help()
+  pa=parser.parse_args(sys.argv[1:])
+  #print(pa,file=sys.stderr)
+  if pa.length is not None:
+    # We have to enforce our own check..
+    if pa.offset is None or pa.filename is None:
+      parser.error("length, offset and filename must all be supplied together")
+  if isinstance(pa.headers,str):
+    pa.headers=dict((bytes(k,'utf-8'),None) for k in pa.headers.split(','))
+  buf=bytearray(128*1024*1024)
+  whole=not (pa.warc or pa.headers or pa.body)
+  if
+    _output=_output_tmpfile
+    import tempfile
+  elif pa.cmd:
+    _output = _output_subproc
+  else:
+    _output = _output_stdout
+  if pa.cmd and pa.process:
+      launch(pa.cmd)
+  # three different ways to process
+  if pa.index:
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
+    for l in sys.stdin:
+      if m is None:
+        if l.find('/robotstxt/')>-1:
+          continue
+        print("index line problem: \"%s\""%l,file=sys.stderr,end='')
+        exit(2)
+      filename=pa.fpath%(m[3:7])
+      process(pa,buf,filename,
+              int(offset:=m[2]),int(length:=m[1]),whole)
+  elif pa.length is not None:
+    print(pa.filename,file=sys.stderr)
+    process(pa,buf,pa.fpath%tuple(pa.filename.split('/')),
+            pa.offset,pa.length,whole)
+  else:
+    print("Reading length, offset, filename tab-delimited triples from stdin...",
+          file=sys.stderr)
+    for l in sys.stdin:
+      try:
+        (length,offset,filename)=l.rstrip().split('\t')
+        length=int(length)
+        offset=int(offset)
+      except ValueError as e:
+        parser.error('Invalid input line: %s\n "%s"'%(e,l))
+      process(pa,buf,pa.fpath%tuple(filename.split('/')),
+              offset,length,whole)
+  # processing done one way or another
+  if pa.cmd and pa.process:
+    windup(length,offset,filename)
+  # if and pa.process, deleting temp files is down to cmd
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+'''Extract identifying info + LastModified header value for all entries
+   that have one
+   Usage: CC-date segment filetype 3-digit-fileno'''
+import re,warc,sys,glob,codecs
+TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
+DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
+def showmeLMH(wtype,buf,part):
+  if part==1:
+    if (
+      URI=m[1]
+    else:
+      raise ValueError(b"No target URI in %s ??"%buf)
+    if (
+      DATE=md[1]
+    else:
+      raise ValueError(b"No date in %s ??"%buf)
+  else:
+    OUT.write(URI)
+    if mm:
+      OUT.write(b'\t')
+      OUT.write(DATE.translate(DTAB,DDEL))
+      OUT.write(b'\t')
+      OUT.write(SEGMENT)
+      OUT.write(b'\t')
+      OUT.write(FILETYPE)
+      OUT.write(b'\t')
+      OUT.write(FILENO)
+      OUT.write(b'\t')
+      OUT.write(mm[1])
+    OUT.write(b'\n')
+(CCdate, segment, filetype, fileno) = sys.argv[1:]
+  CCdate, segment, filetype, fileno)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,144 @@
+'''Add timestamps from Last-Modified-dated (ks.tsv) files into
+   that year's index
+Usage: ksvstream cdx-dir outdir
+ksvstream consists of tab-separated key, CC date, url and Unix timestamp
+''' # '
+import sys, io, os, os.path, time, re
+from isal import igzip
+DEBUG = 0
+while sys.argv[1] == '-d':
+  sys.argv.pop(1)
+  DEBUG += 1  
+XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
+NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
+RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
+SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
+                     b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
+                     b'=[^&]*)')
+ISESSION = re.compile(SESSION.pattern,flags=re.I)
+URL=re.compile(b'\{"url": "([^"]*)"')
+WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
+# Above based on this from broken Java code:
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+os.makedirs(sys.argv[3], exist_ok=True)
+FN = 0
+XCNT = WCNT = 0
+DCNT = 0
+XF = igzip.IGzipFile(filename=XPATH%0)
+NF = open(NN:=(NPATH%0),'wb')
+def nextLine():
+  '''Move on to next index file if current has run out'''
+  while True:
+    xl=XF.readline()
+    XCNT += 1
+    if xl == b'':
+      # need to move to next index file
+      FN += 1
+      XF.close()
+      NF.close()
+      print(NN, flush=True) # so we can compress it
+      print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True)
+      time.sleep(0.1) # so they flush?
+      XN=XPATH%FN
+      if not os.path.exists(XN):
+        return None
+      XF = igzip.IGzipFile(filename=XN)
+      NF = open((NN:=NPATH%FN), 'wb')
+      xl = XF.readline()
+      WCNT = XCNT = 1
+    if
+      WCNT += 1
+      return xl
+    else:
+      NF.write(xl)
+      if DEBUG:
+        sys.stderr.write("out_rc\n")
+def nextDate(df,dn):
+  global DEBUG, DCNT, XCNT
+  dl = df.readline()
+  if dl == b'':
+    # write out the last of the last index file, if any
+    return "", "", "", 0
+  if DEBUG:
+    sys.stderr.write("dl%s: %s\n"%(dn,dl))
+  dkey, ddate, durl, dtime = dl.split(b'\t')
+  DCNT += 1
+  return dkey, ddate, durl, dtime
+# Main loop: step through the dated lines (df) and the index lines
+#  (via nextLine) in parallel.  When key, date and url all agree, the
+#  index line is written with a "lastmod" property added; otherwise
+#  the index line is written as it stands.
+with open(sys.argv[1], 'rb') as df:
+  DCNT = 0
+  dkey, ddate, durl, dtime = nextDate(df,1)
+  while (xl := nextLine()) is not None:
+    # An index line is: key, date, then the properties
+    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
+    m = URL.match(xprops)
+    if m:
+      xurl = m[1]
+    else:
+      raise ValueError("No url in %s"%xprops)
+    if DEBUG:
+      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
+                                            for xp in (xkey, xdate, xurl))))
+    if dkey==xkey and ddate==xdate and durl==xurl:
+      # Got it
+      NF.write(xkey)
+      NF.write(b' ')
+      NF.write(xdate)
+      NF.write(b' ')
+      # Drop the last two bytes of the properties (presumably the
+      #  closing brace and newline -- TODO confirm) to make room for
+      #  the new property
+      NF.write(xprops[:-2])
+      # Drop the last three bytes of the timestamp (presumably a
+      #  trailing ".0" and newline -- TODO confirm)
+      NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
+      if DEBUG:
+        sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
+                                             for xp in (xkey, xdate, xurl))))
+        sys.stderr.write(" %d\n"%int(dtime[:-3]))
+      dkey, ddate, durl, dtime = nextDate(df,2)
+      continue
+    else:
+      # dkey is "" once the dated lines have run out
+      if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
+        # we've missed something, disaster looms
+        print("Fail2:"
+               "      xkey: %s\n"
+               "      dkey: %s\n"
+               "      xdate: %s\n"
+               "      ddate: %s\n"
+               "      xurl: %s\n"
+               "      durl: %s\n"
+               "FN: %s XCNT: %s DCNT: %s\n"
+               "xl: %s"%(xkey, dkey, xdate, ddate,
+                         xurl, durl,
+                         FN, XCNT, DCNT, xl),
+              file=sys.stderr)
+        # try to force recovery
+        # Note that the current index line is not written in this case
+        dkey, ddate, durl, dtime = nextDate(df,3)
+        continue
+      # else fall through to write
+    NF.write(xl)
+    if DEBUG:
+      sys.stderr.write("out_nl\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,36 @@
+'''refactor a per-cdx count table to be per-segment
+input on STDIN
+Usage: per_segment segment-column
+Assumes column 0 is empty, count is in column 1
+Segment column is 0-origin
+import sys
+ss=[dict() for i in range(100)]
+for l in sys.stdin:
+  try:
+    cc=l.split('\t')
+    s=int(cc.pop(c))
+    n=int(cc.pop(1))
+    ll='\t'.join(cc[1:]) # note we ditch the initial empty column
+    #print(s,n,cc,ll,sep='|')
+    #exit(0)
+    t=ss[s].get(ll,0)
+    ss[s][ll]=t+n
+  except:
+    sys.stdout.write(l)
+    print(cc)
+    exit(1)
+# note this won't work if c is last column!
+for s in range(100):
+  with open('s%s.tsv'%s,'w') as f:
+    for (l,c) in sorted(ss[s].items(),key=lambda p:p[1],reverse=True):
+      f.write(str(c))
+      f.write('\t')
+      f.write(l)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,9 @@
+'''Handle unquoting of non-UTF-8 bytes by %-encoding them'''
+import codecs
+def percent_encode(ude):
+  #print(ude.object,ude.object[ude.start:ude.end])
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+          ude.end)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,142 @@
+'''Process output of lmh_warc [new 3-column version]
+   Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
+# Assumes you have used grep -v $'\t' on input for speed
+# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
+#  to fix a common 'bad' timestamp (~ .2% of inputs)
+import email.utils
+import sys
+from urllib.parse import urlsplit, quote, unquote
+import surt
+import re, codecs
+from itertools import chain
+WPAT = re.compile('(,www\\d*)+\\)')
+# Thanks to
+import locale
+from functools import cmp_to_key
+def percent_encode(ude):
+  #print(ude.object,ude.object[ude.start:ude.end])
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+          ude.end)
+def _u_esc(c):
+  if c<65536:
+    return '\\u%04X'%c
+  else:
+    return '\\U%08X'%c
+def java_unicode_encode(ude):
+  '''Error handler for encoding, like backslashreplace but using
+  upper-case hex, and a 4-digit u escape where backslashreplace would
+  use a 2-digit x escape.
+
+  ude is the UnicodeEncodeError being handled.  Returns the pair
+  (replacement string, position at which to resume encoding).'''
+  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
+          ude.end)
+# From RFC-3986:
+# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+#                / "*" / "+" / "," / ";" / "="
+# But # _is_ escaped in Java surt results
+#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
+# Note also that although quote already does _not_ quote - . / _ ~
+#  they are included below as that's what we find in surt.surt 0.3.1
+# Also, Java surt strips _all_ leading 'www\d*.',
+#  where python3 surt only strips the first one.
+# And Java strips so-called option session-ids, but python doesn't
+import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
+import surt.URLRegexTransformer
+ident = ''.join(chr(i) for i in range(256)).encode('latin-1')
+# For removal of non-printing characters:
+#  Note, this is only a guess, the only example so far is DEL
+NONPRINT= ''.join(chr(i) for i in chain(range(9),
+                                      range(14,32),
+                                      [127] # DEL
+                                      )).encode('latin-1')
+def notDefaultCanon(hu,**options):
+  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(
+    # Try to fix the incompatibility between Java and 
+    #  Python surt handling of 'octal' numbers in numeric IPv4 addresses
+    #  and it should!  See "After this line:
+    # 
+    # 15,225,107,143)" in .../azure/notes.txt
+    try:
+      bytestrs ='.')
+ = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
+    except ValueError:
+      pass
+  if hu.query:
+    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)
+  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
+# Hack this to reproduce the Java bug
+surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
+    re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
+    ]
+# Above based on this from broken Java code:
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+def cdx_key(uristring):
+  _surt = quote(unquote(surt.surt(unquote(uristring),
+                                  canonicalizer=notDefaultCanon),
+                        errors='percent'),
+                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # '
+                  ).lower()
+                # Wrt \x7f (DEL), see discussion in notes wrt
+                #   "biz,televida)" case
+                # It remains to be seen whether other non-printing bytes
+                #  will need to be treated as 'safe'
+  return WPAT.sub(')',_surt)
+def keyed(l):
+  uri, cc_stamp, dateTime = l.split('\t',2)
+  #print('ul',uri,file=sys.stderr)
+  try:
+    try:
+      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
+    except OverflowError:
+      epoch = 32535215999.0
+    return ((cdx_key(uri), cc_stamp, uri), epoch)
+  except (TypeError,IndexError,ValueError) as e:
+    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
+    return
+# Read the whole input file, key each line, sort on the
+#  (key, timestamp, uri) triples and print the result tab-separated
+fstr = sys.argv[1]
+with open(fstr,"r") as ff:
+  # crucial that the following is done _after_ the file is opened
+  #  with the default (utf-8) locale!
+  locale.setlocale(locale.LC_ALL, "C")
+  # NOTE(review): ctk is not used in the code visible here
+  ctk=cmp_to_key(locale.strcoll)
+  # Lines for which keyed returns None have already been reported
+  #  and are skipped
+  for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
+                   key=lambda x:x[0]):
+    # The 'java_unicode' error handler is not registered in the code
+    #  visible here -- presumably with codecs.register_error, TODO confirm
+    print(key[0],key[1],
+          key[2].encode('ascii',errors='java_unicode').decode('ascii'),
+          ts,sep='\t')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment 
+   First column is for whole crawl, then 100 columns for segs 0-99
+   Each row is counts for some property, e.g. mime-detected or tld
+   For example, assuming all.tsv has the whole-crawl warc-only counts
+   and s...tsv have the segment counts, all with counts in column 1,
+   tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+   will produce such a file with
+     * 100 rows, one for each of the top 100 counts
+     * 101 columns, 0 for all and 1--100 for segs 0--99
+   Usage: python3 -i name id
+     where name.csv has the input
+import numpy as np
+from numpy import loadtxt
+from scipy import stats
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+import pylab
+import sys, math
+def qqa():
+  '''Draw a q-q plot of the global all, with a title mentioning the
+  global id.'''
+  # q-q plot for the whole crawl
+  sm.qqplot(all, line='s')
+  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
+def qqs():
+  # q-q plots for the best and worst (by variance) segments
+  global xv, xworst, xbest
+  xv=[d.variance for d in xd]
+  xworst=xv.index(max(xv))
+  xbest=xv.index(min(xv))
+  print(xbest,xworst)
+  sm.qqplot(x[xbest], line='s')
+  plt.gca().set_title('Best segment (least variance): %s'%xbest)
+  sm.qqplot(x[xworst], line='s')
+  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+def plot_x(sort=False,block=True,all_only=True,title=None):
+  # Make these two subplots, w. and w/o sorting
+  # See
+  #  for legend hacking
+  if sort:
+    aso=np.argsort(-all)
+    plot_all=all[aso]
+    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
+  else:
+    plot_all=all
+    plot_x=[xd[i].mean for i in range(N)]
+  if title is None:
+    l1='Rank correlation of segment x whole crawl'
+    l2='Mean of segment x whole crawl'
+    plt.legend(loc='best',fontsize='small')
+  else:
+    l1=l2=None
+  plt.plot(plot_all,'rx',label=l1)
+  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
+  if not(all_only):
+    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
+  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
+  plt.grid(True)
+  if title is not None:
+    plt.title(title)
+def hist_x(align='mid'):
+  '''Histogram of the means of the entries of xd, with markers based
+  on the globals xm and xsd.  align is passed on to hist.'''
+  hist(xm,xsd,[xd[i].mean for i in range(N)],
+       'Mean of rank correlation of each segment x all other segments',
+       align)
+def hist_all(align='mid'):
+  '''Histogram of the global all, with markers based on all_m and the
+  square root of the variance from all_s.  align is passed on to hist.'''
+  hist(all_m,np.sqrt(all_s.variance),all,
+       'Rank correlation of each segment x whole crawl %s'%id,
+       align)
+def hist(m,sd,hh,title,align):
+  sdd=[(i,m-(i*sd)) for i in range(-2,3)]
+  fig,hax=plt.subplots() # Thanks to
+  sdax=hax.twiny()
+  hax.hist(hh,color='lightblue',align=align)
+  hax.set_title(title)
+  for s,v in sdd:
+       sdax.plot([v,v],[0,18],'b')
+  sdax.set_xlim(hax.get_xlim())
+  sdax.set_ylim(hax.get_ylim())
+  sdax.set_xticks([v for s,v in sdd])
+  sdax.set_xticklabels([str(s) for s,v in sdd])
+def ci(rho,n,conf=0.95):
+  # Courtesy of
+  # rho is (rank) correlation, n is sample size
+  stderr=1.0/math.sqrt(n-3)
+  z=stats.norm.ppf(1.0-((1.0-conf)/2))
+  delta=z*stderr
+  lower=math.tanh(math.atanh(rho)-delta)
+  upper=math.tanh(math.atanh(rho)+delta)
+  return (lower,upper)
+def plot_ci(rhos,n,trim=None,conf=0.95):
+   # rhos are (rank) correlation values
+   rhos_s=rhos[(-rhos).argsort()]
+   if trim is None:
+     l=len(rhos)
+   else:
+     rhos_s=rhos_s[:trim]
+     l=trim
+   cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
+   ue=cc[1]-rhos_s
+   le=rhos_s-cc[0]
+   #for i in range(len(rhos)):
+     #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+   plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+   plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
+def first_diff(ranks):
+  # first disagreement with baseline == {1,2,...}
+  for i in range(len(ranks)):
+    if ranks[i]!=i+1.0:
+      return i
+  return i+1
+def ranks():
+  # Combine segment measures:
+  #  segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
+  # convert to ranks, smallest value == highest rank
+  all_ranked=stats.rankdata(-all,method='average') # invert since
+                                                   #  large corr is good
+  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
+                                                  # small corr variance is good
+  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
+                                                   # invert since
+                                                   #  large mean corr is good
+  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
+                                                   # invert since
+                                                   #  large first diff is good
+  return np.array([[i,
+                    all_ranked[i],
+                    x_variance_ranked[i],
+                    x_mean_ranked[i],
+                    fd_ranked[i]] for i in range(N)])
+def main():
+  '''Load the table of counts from <argv[1]>.csv and compute, as
+  globals, the rank correlations and summary statistics which the
+  plotting functions in this file use.  argv[2] is kept as id, for
+  use in plot titles.'''
+  global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
+  global aa, aa_by_all, N
+  counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+  id=sys.argv[2]
+  # One column is for the whole crawl, the rest are segments
+  N=counts.shape[1]-1
+  # "If axis=0 (default), then each column represents a variable, with
+  #        observations in the rows"
+  # So each column is a sequence of counts, for whole crawl in column 0
+  #   and for segments 0--N-1 in columns 1--N
+  corr=stats.spearmanr(counts,nan_policy='omit').correlation
+  # Correlation of each segment with the whole crawl
+  all=corr[0][1:]
+  all_s=stats.describe(all)
+  all_m=all_s.mean
+  # For each segment, its correlations with every other segment,
+  #  leaving out the whole crawl (column 0) and the segment itself
+  x=np.array([np.concatenate((corr[i][1:i],
+                              corr[i][i+1:])) for i in range(1,N+1)])
+  # The above, although transposed, works because the correlation matrix
+  #  is symmetric
+  xd=[stats.describe(x[i]) for i in range(N)]
+  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
+  xm=xs.mean
+  xsd=np.sqrt(xs.variance)
+  # Rank the counts within each segment, largest count ranked first
+  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
+  aa=ranks()
+  # The same rows, ordered by column 1 (rank by correlation with all)
+  aa_by_all=aa[aa[:,1].argsort()]
+### I need to review rows, e.g. counts[0] is an array of N+1 counts
+###   for the most common label in the complete crawl,
+###   from the complete crawl and all the segments
+### versus columns, e.g. counts[:,0] is an array of N decreasing counts
+###   for all the labels in the complete crawl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,33 @@
+import warc,sys
+# A leading -d switches on debug output; drop it so that the remaining
+#  arguments keep their usual positions
+debug=(sys.argv[1]=='-d')
+if debug:
+  del sys.argv[1]
+def showme(wtype,buf,part):
+  '''Callback for warc.warc: copy buf to OUT and return OUT.
+
+     wtype is accepted but not used.  With debug set, the output is
+     bracketed by start/end marker lines carrying the part number.'''
+  # This should exactly reproduce a complete warc file if called
+  #  as per version 1 below
+  if debug:
+    OUT.write(b"----start %d-----\n"%part)
+  OUT.write(buf)
+  # One CRLF if the buffer does not already end with a linefeed,
+  #  and one more after a whole record (part 7),
+  #  to match complete file formatting
+  for wanted in (buf[-1]!=10, part==7):
+    if wanted:
+      OUT.write(b'\r\n')
+  if debug:
+    OUT.write(b"----end %d-----\n"%part)
+  return OUT
+# Test variants, selected by tt: the record types to show and whether
+#  to ask for whole records (True) or for the parts given by argv[2] (False)
+# NOTE(review): tt is not assigned in the visible part of this script
+_ALL_TYPES=[b'response','warcinfo','request','metadata']
+_VARIANTS={1:(_ALL_TYPES,False),
+           2:([b'warcinfo'],False),
+           3:([b'warcinfo'],True),
+           4:(_ALL_TYPES,True),
+           5:([b'response'],False)}
+if tt in _VARIANTS:
+  (wanted_types,as_whole)=_VARIANTS[tt]
+  if as_whole:
+    warc.warc(sys.argv[1],showme,wanted_types,whole=True,debug=debug)
+  else:
+    warc.warc(sys.argv[1],showme,wanted_types,parts=int(sys.argv[2]),debug=debug)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+'''Stream a warc format file, unzipping if necessary, invoking a
+callback on each record.  Callback can be limited by WARC-Type and
+by record part (see the parts bit-mask of warc() below).'''
+import sys,io
+from isal import igzip
+# The WARC-Type values recognised by warc(), as bytes; one of these is
+#  passed to the callback as its first argument
+RESP = b'response'
+REQ = b'request'
+META = b'metadata'
+INFO = b'warcinfo'
+def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
+  '''Stream the WARC records in filename, handing selected ones to callback.
+
+     filename: path of the input; read via igzip if it ends in ".gz",
+       otherwise as an unbuffered binary file.
+     callback: called as callback(wtype,buf,part), where wtype is one of
+       RESP, REQ, META or INFO, buf is a memoryview slice of the internal
+       buffer and part says what buf holds: 7 for a whole record,
+       otherwise 1 or 2 as per parts below.
+     types: WARC-Type values (str or bytes) of the records to pass on;
+       records of any other type are skipped.
+     whole: if true, each selected record is passed in one call with
+       part=7 and parts is not consulted.
+     parts is a bit-mask:
+     1 for warc header;
+     2 for req/resp HTTP header, warcinfo/metadata features;
+     4 for req/resp body
+     debug: not referenced anywhere in the body of this function.
+
+     Returns None once the input is exhausted.
+     Raises ValueError if a record does not start with "WARC/1.0", has an
+       unrecognised WARC-Type, or needs more bytes than the input has.'''
+  # should do some sanity checking wrt parts and types
+  # Normalise types to bytes, so they compare with RESP, REQ, META, INFO
+  types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
+  nb=0
+  if filename.endswith(".gz"):
+    stream=igzip.IGzipFile(filename=filename)
+  else:
+    stream=open(filename,'rb',0)
+  bufSize=2*1024*1024
+  hdrMax=16*1024
+  buf=bytearray(bufSize)
+  bufView=memoryview(buf)
+  # bl: number of valid bytes in buf; fpos: total bytes read so far
+  fpos=bl=stream.readinto(buf)
+  bp=0 # current parse position in buf
+  done=False # set once a read returns less than was asked for
+  while True:
+    # One iteration per record.  Skip the blank lines between records
+    while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty
+      bp+=2
+    start_1=bp # start of this record's WARC header
+    if not buf.startswith(b'WARC/1.0\r\n',bp):
+      if done and bl-bp==0:
+        # really done
+        return
+      raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename,
+                                                                   bp,bl,fpos,
+         (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
+                                                                     bl-bp))
+    bp+=10 # step over the version line
+    wtype=None
+    length=None
+    state=1
+    tr=None # Was this record truncated?
+    # Scan the WARC header, a line at a time, up to the blank line that
+    #  ends it, picking out the three fields we care about
+    while not buf.startswith(b'\r\n',bp):
+      # there should always be enough in the buffer to complete this loop,
+      #  because of the buffer update logic below
+      eol=buf.index(b'\r\n',bp)+2
+      if buf.startswith(b"Content-Length: ",bp):
+        length=wl=int(bufView[bp+16:eol-2])
+      elif buf.startswith(b"WARC-Truncated: ",bp):
+        if bp+16==eol-2:
+          tr=b"EMPTY"
+        else:
+          tr=bytes(bufView[bp+16:eol-2])
+      elif buf.startswith(b'WARC-Type: ',bp):
+        # Decide on the strength of a single byte of the value, which
+        #  starts at bp+11: its third byte for response/request,
+        #  its first byte for metadata/warcinfo
+        # NOTE(review): any other type whose third byte is 's' or 'q'
+        #  (e.g. 'resource') would be taken for a response or request
+        if buf.startswith(b's',bp+13):
+          wtype = RESP
+        elif buf.startswith(b'q',bp+13):
+          wtype = REQ
+        elif buf.startswith(b'm',bp+11):
+          wtype = META
+        elif buf.startswith(b'w',bp+11):
+          wtype = INFO
+        else:
+          raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
+                             bytes(bufView[bp+11:eol-2]),filename,
+                             fpos-(bl-bp)))
+      bp=eol
+    # eol is now the end of the last header line, bp the start of the
+    #  record's content, just past the blank line
+    bp=eol+2
+    if done:
+      # Nothing more to read, so the whole record must already be here
+      if (bp+length)>bl:
+        raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
+                         length,bl,filename))
+    elif (bp+(length+hdrMax))>bl:
+      # Need more data
+      # (the hdrMax margin is what is meant to guarantee that the header
+      #  of the _next_ record is in the buffer as well)
+      if wtype in types:
+        # we need to keep from start_1 to bl
+        keepFrom=start_1
+        keepLen=bl-keepFrom
+        buf[0:keepLen]=bufView[keepFrom:bl]
+        # Re-base the saved positions to match the shifted contents
+        eol=eol-start_1
+        start_1=0
+        bp=eol+2
+      else:
+        # we can skip the rest of this part
+        if (bp+length)<=bl:
+          # we have at least some bytes from the next part
+          keepLen=bl-(bp+length)
+          buf[0:keepLen]=bufView[bl-keepLen:bl]
+        else:
+          # we don't have all of the bytes from the current part
+          #  so can skip the rest of it
+          # NOTE(review): nothing visible here consumes the unread
+          #  remainder of this record from stream, so it looks as if it
+          #  will be at the front of the refilled buffer -- confirm
+          keepLen=0
+        bp=0
+      # Top up the buffer after the bytes we kept
+      spaceToFill=bufSize-keepLen
+      with memoryview(buf)[keepLen:bufSize] as xBuf:
+        nb=stream.readinto(xBuf)
+      fpos+=nb
+      bl=keepLen+nb
+      if nb<spaceToFill:
+        done=True
+      if wtype not in types:
+        # Skipped record: bp is already at the start of what follows it
+        continue
+    if (wtype in types):
+      # Output whole or part 1 as required
+      if whole:
+        bp+=length
+        OUT=callback(wtype,bufView[start_1:bp],7)
+        continue
+      elif (parts & 1):
+        OUT=callback(wtype,bufView[start_1:eol],1)
+      if parts!=1:
+        # Trim CRLFs from both ends of the content:
+        #  start_2 is its first byte, eob is one past its last
+        while buf.startswith(b'\r\n',bp):
+          bp+=2
+        start_2=bp
+        eob=bp+length
+        while buf.startswith(b'\r\n',eob-2):
+          eob-=2
+        # Only output parts (2 = HTTP header, 4 = body) that are wanted
+        if parts & 2:
+          if wtype is META or wtype is INFO:
+            # rest of the part
+            OUT=callback(wtype,bufView[start_2:eob],2)
+          else:
+            # request and response have http headers
+            eo2=buf.index(b'\r\n\r\n',start_2)
+            OUT=callback(wtype,bufView[start_2:eo2+2],2)
+        if parts & 4:
+          # NOTE(review): rec_text, L_start and offset are not assigned
+          #  anywhere in this function, so this branch looks like
+          #  left-over code that cannot run as it stands, and it also
+          #  rebinds bl, the buffer fill level -- confirm before
+          #  relying on parts & 4
+          for L in rec_text:
+            if state==2:
+              # HTTP header
+              wl -= len(L)
+              if not (L==b"" or L.startswith(b"\r")):
+                # Non-empty, it's (a continuation of) a header
+                if bl is None and L.startswith(b"Content-Length: "):
+                  bl=int(L[16:].rstrip())
+              else:
+                # Blank line, HTTP header is finished
+                if parts & 2:
+                  callback(wtype,bufView[start_2:start_2+L_start],2)
+                state=4
+                # The above is just for sanity, because we do _not_
+                #  continue with the outer loop,
+                #  since we can now block-output the entire rest of the
+                #  input buffer.
+                if bl is not None:
+                  if bl!=wl:
+                    print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
+                          (length,offset,filename,wl,bl,tr),file=sys.stderr)
+                # HTTP body
+                balance=start_2+rec_text.tell()
+                #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
+                # Output whatever is left
+                if parts & 4:
+                  callback(wtype,bufView[balance:balance+wl],4)
+                state=1
+              L_start=rec_text.tell()
+    # Step over the record's content, ready for the next record
+    bp+=length
+    #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,33 @@
+'''Extract and tabulate runtimes per file from a slurm output log'''
+import sys, re
+from datetime import datetime
+# Log lines of interest look like "... <timestamp> BST start <id>" and
+#  "... <timestamp> BST end <id>", where <id> is two numbers
+SPAT=re.compile('... (.*) BST start ([0-9]+ [0-9]+)')
+EPAT=re.compile('... (.*) BST end ([0-9]+ [0-9]+)')
+TFMT="%d %b %Y %I:%M:%S %p" # layout of the timestamps
+pending={}  # id -> start time, for jobs seen to start but not yet to end
+first=None  # the first start time accepted
+e=None      # the most recent end time seen
+with open(sys.argv[1],'r') as f:
+  for l in f:
+    if m:=SPAT.match(l):
+      b=datetime.strptime(m[1],TFMT)
+      job=m[2]
+      if job in pending:
+        print('%s started twice at %s, %s'%(job,pending[job],b),file=sys.stderr)
+      else:
+        pending[job]=b
+        if first is None:
+          first=b
+    elif m:=EPAT.match(l):
+      e=datetime.strptime(m[1],TFMT)
+      job=m[2]
+      if job in pending:
+        # total_seconds, not .seconds, so that whole days are not dropped
+        delta=int((e-pending.pop(job)).total_seconds())
+        # One line per job on stdout: seconds, then minutes:seconds
+        print(delta,"%2d:%02d"%(delta//60,delta%60),sep='\t')
+      else:
+        print('%s ended w/o start at %s'%(job,e),file=sys.stderr)
+# Summary on stderr: the overall span, as h:mm:ss and in seconds
+if first is None or e is None:
+  print('No start and end times found in %s'%sys.argv[1],file=sys.stderr)
+else:
+  w=int((e-first).total_seconds())
+  print('From %s to %s:'%(first.strftime(TFMT),
+                          e.strftime(TFMT)),file=sys.stderr)
+  print(' %d:%02d:%02d'%(w//3600,(w//60)%60,w%60),w,sep='\t',file=sys.stderr)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+import sys,pdfx,traceback,os
+from datetime import datetime
+def run(file):
+  '''Print to stdout the references pdfx finds in the PDF named file.
+
+     If the 'scrape' or 'annot' entry of the reference dictionary is
+     non-empty, every entry is printed, one "key<TAB>reference" line
+     per reference; otherwise the single line "None" is printed.
+     Nothing is raised: a PDF that fails with 'Unexpected EOF' gives
+     "badpdf" on stdout and a one-line report on stderr; any other
+     failure gives the file name and a traceback on stderr.'''
+  try:
+    pdf=pdfx.PDFx(file)
+    links=pdf.get_references_as_dict()
+    if links and (links.get('scrape',False) or
+                  links.get('annot',False)):
+      for k,refs in links.items():
+        for l in refs:
+          print("%s\t%s"%(k,l))
+    else:
+      print("None")
+  except Exception as e:
+    # Deliberately broad: one bad PDF must not stop a batch run
+    if str(e)=='Unexpected EOF':
+      print("%s:\t%s"%(file,e),file=sys.stderr)
+      print("badpdf")
+    else:
+      # Name the file, then let the traceback follow on the same line
+      print("%s: "%(file,),end='',file=sys.stderr)
+      traceback.print_exc(file=sys.stderr)
+if sys.argv[1]=='-':
+  # Batch mode: one file name per line on stdin.  Each file is
+  #  processed and then deleted; progress (a running count) goes to stderr
+  for i,l in enumerate(sys.stdin):
+    print(i,file=sys.stderr)
+    f=l.rstrip()
+    if os.path.getsize(f)==1048576: # truncated
+      # Exactly 1MiB is taken to mean the file was cut short,
+      #  so don't even try to parse it
+      print("truncated",file=sys.stderr)
+      print("truncated")
+    else:
+      run(f)
+    os.unlink(f)
+else:
+  # Single-file mode: the argument is the PDF itself, which is kept
+  run(sys.argv[1])