changeset 120:1d1bd22124c0

moved from bin
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Sep 2023 08:46:01 +0100
parents 1d12b51c4d59
children f8d5e5355c4c
files bin/build_idx.py bin/count_warc.py bin/deltas.py bin/fix_key.py bin/ix.py bin/lmh_warc.py bin/merge_date.py bin/per_segment.py bin/percent_encode.py bin/plinks.py bin/sort_date.py bin/spearman.py bin/test_warc.py bin/warc.py lib/python/cc/build_idx.py lib/python/cc/count_warc.py lib/python/cc/fix_key.py lib/python/cc/ix.py lib/python/cc/lmh_warc.py lib/python/cc/merge_date.py lib/python/cc/per_segment.py lib/python/cc/percent_encode.py lib/python/cc/sort_date.py lib/python/cc/spearman.py lib/python/cc/test_warc.py lib/python/cc/warc.py lib/python/deltas.py lib/python/plinks.py
diffstat 28 files changed, 1237 insertions(+), 1237 deletions(-)
--- a/bin/build_idx.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-#!/usr/bin/python3
-'''Turn a merge_nnn.log file into a cluster.idx file
-   We cheat and use the old cluster.idx to save having to read
-   all the cdx-....gz files'''
-import sys
-
-with open('cluster.idx','r') as oidx, open('new.idx','w') as nidx:
-  i=-1
-  curpos=0
-  target="cdx-00%03d.gz"%i
-  log=open("/dev/null",'r') # embarassing hack
-  for ol in oidx:
-    (surt, datestamp, file, offset, length, cnt) = ol.split()
-    if file!=target:
-      i+=1
-      target="cdx-00%03d.gz"%i
-      log.close()
-      curpos=0
-      log=open('merge_%d.log'%(i+1),'r')
-      hdr=log.readline()
-      (j,f) = hdr.split()
-      sys.stderr.write(hdr)
-      if int(j)!=i+1:
-        raise ValueError("wrong file: i=%s, j=%s"%(i,j))
-    nl=log.readline()
-    if not nl:
-      sys.stderr.write('quitting early: %s\n'%i)
-      exit(1)
-    nlen=int(nl)
-    nidx.write("%s %s\t%s\t%s\t%s\t%s\n"%(surt, datestamp, file, curpos, nlen, cnt))
-    curpos+=nlen
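
A minimal sketch of the cluster.idx line this script emits (values hypothetical; it assumes each merge_N.log opens with an "N filename" header line followed by one compressed-block length per line):

  surt, datestamp, cnt = 'com,example)/', '20230928084601', '3000'
  file, curpos, nlen = 'cdx-00042.gz', 0, 187345
  # emits: com,example)/ 20230928084601\tcdx-00042.gz\t0\t187345\t3000
  print("%s %s\t%s\t%s\t%s\t%s" % (surt, datestamp, file, curpos, nlen, cnt))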
--- a/bin/count_warc.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-#!/usr/bin/env python3
-import warc,sys
-
-OUT=open(sys.stdout.fileno(),'wb')
-
-if (debug:=(sys.argv[1]=='-d')):
-  sys.argv.pop(1)
-
-def countme(wtype,buf,part):
-  if debug:
-    breakpoint()
-  OUT.write(b"%d\n"%len(buf))
-
-#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
-#warc(showme,[b'response'],whole=True)
-
-warc.warc(sys.argv[1],countme,[b'response'],parts=int(sys.argv[2]),debug=debug)
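
The callback protocol warc.warc expects (see warc.py below) is a callable taking (wtype, buf, part); countme just prints part lengths. A sketch of an accumulating variant, under the same assumptions:

  total = 0
  def tally(wtype, buf, part):
      # buf is a memoryview slice of the requested record part
      global total
      total += len(buf)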
--- a/bin/deltas.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-#!/usr/bin/python3
-'''Extract and tabulate runtimes per file from a slurm output log'''
-import sys, re
-from datetime import datetime
-pending={}
-first=None
-SPAT=re.compile('... (.*) BST start ([0-9]+ [0-9]+)')
-EPAT=re.compile('... (.*) BST end ([0-9]+ [0-9]+)')
-with open(sys.argv[1],'r') as f:
-  for l in f:
-    if m:=SPAT.match(l):
-      b=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p")
-      id=m[2]
-      if id in pending:
-        print('%s started twice at %s, %s'%(id,pending[id],b),file=sys.stderr)
-      else:
-        pending[id]=b
-        if first is None:
-          first=b
-    elif m:=EPAT.match(l):
-      e=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p")
-      id=m[2]
-      if id in pending:
-        delta=(e-pending[id]).seconds
-        print(delta,"%2d:%02d"%(delta/60,delta%60),sep='\t')
-        del pending[id]
-      else:
-        print('%s ended w/o start at %s'%(id,e),file=sys.stderr)
-w=(e-first).seconds
-sys.stdout.flush()
-print('From %s to %s:'%(first.strftime("%d %b %Y %I:%M:%S %p"),
-                        e.strftime("%d %b %Y %I:%M:%S %p")),file=sys.stderr)
-print(' %d:%02d:%02d'%(w/3600,(w/60)%60,w%60),(e-first).seconds,sep='\t',file=sys.stderr)
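
The slurm log lines SPAT/EPAT expect look roughly like this (reconstructed from the patterns; ids are hypothetical):

  # Thu 28 Sep 2023 08:46:01 AM BST start 12345 678
  # Thu 28 Sep 2023 09:01:13 AM BST end 12345 678
  from datetime import datetime
  datetime.strptime('28 Sep 2023 08:46:01 AM', '%d %b %Y %I:%M:%S %p')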
--- a/bin/fix_key.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-#!/usr/bin/python3
-from percent_encode import percent_encode
-from urllib.parse import quote, unquote
-import sys
-
-# From RFC-3986:
-# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
-# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
-#                / "*" / "+" / "," / ";" / "="
-# But # _is_ escaped in Java surt results
-#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
-# Note also that quote already does _not_ quote - . / _ ~
-
-# Also, Java surt strips _all_ leading 'www.',
-#  where python3 surt only strips the first one.
-
-with open(sys.argv[1],"r") as f:
-  for l in f:
-    while l.endswith(',www',0,ploc:=l.index(')')):
-      l=l[:ploc-4]+l[ploc:]
-    if '%' in l:
-      (key,wt,ts)=l.split('\t')
-      sys.stdout.write(quote(unquote(key,errors='percent'),
-                             safe='!"$&\'()*+,:;<=>?@[\\]^`{|}').lower())
-      sys.stdout.write('\t')
-      sys.stdout.write(wt)
-      sys.stdout.write('\t')
-      sys.stdout.write(ts)
-    else:
-      sys.stdout.write(l)
-
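A worked example of the ',www'-stripping loop above, on a hypothetical SURT key line:

  l = 'com,example,www,www)/\twarc\t20230928\n'
  while l.endswith(',www', 0, (ploc := l.index(')'))):
      l = l[:ploc-4] + l[ploc:]
  # l is now 'com,example)/\twarc\t20230928\n'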
--- a/bin/ix.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,315 +0,0 @@
-#!/usr/bin/env python3
-'''Extract response records from Common Crawl WARC-format files
-given length, offset and filename triples.
-Input one triple on command line, or
-triples from stdin as tab-delimited lines
-or complete cdx index lines.
-In all cases, 'filename' means crawlid/segmentid/type/filename
-
-Note that if no output flags are given, the whole WARC record will be output, more efficiently than if -whb were given.'''
-
-import sys, argparse, regex, os, shutil, io, gzip, time, shlex
-from isal import igzip
-from subprocess import Popen, PIPE
-#import asyncio
-
-HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
-BINOUT=sys.stdout.buffer
-FPAT="/%s/%s/orig/%s/%s"
-
-CMD_PROC=None
-TMPFILENAME=None
-
-class HackFormat(argparse.RawDescriptionHelpFormatter):
-  def format_help(self):
-    FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
-    return HACK_USAGE.sub('\n             [ ( -x | length offset filename ) ]',
-                          FOO)
-
-def process(options,buf,filename,offset,length,whole):
-    try:
-      process0(options,buf,filename,offset,length,whole)
-    except Exception as e:
-      if options.debug:
-        import traceback
-        traceback.print_exc(file=sys.stderr)
-      else:
-        print("Process fail: %s, input line:\n %s"%(e,l),
-              file=sys.stderr,end='')
-      exit(3)
-
-def process0(options,buf,filename,offset,length,whole):
-  global TMPFILENAME, TMPFILE
-  if options.save:
-    (tf,TMPFILENAME)=tempfile.mkstemp()
-    TMPFILE=open(tf,mode='wb')
-  if options.cmd and not options.process:
-    launch(options.cmd)
-  process1(options,buf,filename,offset,length,whole)
-  if options.save:
-    TMPFILE.close()
-    if options.cmd:
-      _output_subproc(bytes(TMPFILENAME,'utf-8'))
-      _output_subproc(b"\n")
-    else:
-      BINOUT.write(bytes(TMPFILENAME,'utf-8'))
-      BINOUT.write(b"\n")
-  if options.cmd:
-    if not options.process:
-      windup(length,offset,filename)
-    if options.save:
-      os.unlink(TMPFILENAME)
-      TMPFILENAME=None
-  elif options.save:
-    print("%s will need to be deleted"%TMPFILENAME,file=sys.stderr)
-    TMPFILENAME=None
-
-def launch(cmd):
-  global CMD_PROC, BINOUT
-  CMD_PROC=Popen(shlex.split(cmd),stdin=PIPE,bufsize=0)
-  BINOUT=CMD_PROC.stdin
-
-def windup(length,offset,filename):
-  # Wind up subproc
-  BINOUT.close()
-  if CMD_PROC.wait()!=0:    # could/should be async?
-    print("subproc of %s:%s:%s failed with %s"%(length,offset,filename,
-                                                CMD_PROC.returncode),
-          file=sys.stderr)
-  
-def _output_tmpfile(buf):
-  TMPFILE.write(buf)
-
-def _output_stdout(buf):
-  BINOUT.write(buf)
-
-def _output_subproc(buf):
-  toWrite=len(buf)
-  while toWrite>0:
-    toWrite -= BINOUT.write(buf)    
-
-def process1(options,buf,filename,offset,length,whole):
-  root=options.root
-  rfn=root+filename
-  if root!="/beegfs/common_crawl":
-    # Support using ramdisk or other local disk as a faster cache
-    if not os.path.exists(rfn):
-      if not os.path.exists(os.path.dirname(rfn)):
-        os.makedirs(os.path.dirname(rfn))
-      with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \
-              io.FileIO(rfn,'w') as outfile:
-        #shutil.copyfileobj(infile,outfile,128*1024*1024)
-        while True:
-          l=infile.readinto(buf)
-          if l==0:
-            break
-          outfile.write(memoryview(buf)[:l])
-  file=open(rfn,'rb',0)
-  file.seek(offset)
-  bv=memoryview(buf)[:length]
-  nb=file.readinto(bv)
-  file.close()
-  if nb!=length:
-    raise ValueError("Chunk read losing: %s, got %s expected %s at %s"%(file.name,
-                                                                  nb,length,offset))
-  if whole and options.zipped:
-    _output(bv)
-    return
-  gzip_chunk = io.BytesIO(bv)
-  uv=memoryview(buf)[length:]
-  with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
-    ll=0
-    while True:
-      l=gzip_fin.readinto(uv)
-      if not l:
-        break
-      ll+=l
-    cb=memoryview(uv)[:ll]
-    if whole:
-      _output(cb)
-      return
-  # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
-  state=0
-  tr=None # Was this record truncated?
-  bl=None # HTTP Content-Length header value, i.e. the expected body length
-  # Could we make this faster by working purely within the cb memoryview?
-  # It would be messy, but avoid copying huge amounts
-  # The outer loop would just be something like
-  #   clbv=memoryview(bytearray(b"Content-Length: "))
-  #   i=s=0
-  #   while i<ll:
-  #     if cb[i]==10: # need to handle \r\n
-  #       L=cb[s:i]
-  #       s=i=i+1
-  #       if L[:16]==clbv:
-  #         wl=int(L[16:])
-  #     else:
-  #       i+=1
-  #
-  with io.BytesIO(cb) as clear_text:
-    for L in clear_text:
-      if state==0:
-        # WARC header
-        if L.startswith(b"Content-Length: "):
-          wl=int(L[16:].rstrip())
-        elif L.startswith(b"WARC-Truncated: "):
-          tr=L[16:].rstrip()
-          tr="EMPTY" if tr=="" else tr
-        elif L==b"" or L.startswith(b"\r"): # for idempotency
-          # Blank line, WARC header is finished
-          if not (options.headers or options.body):
-            return
-          state=1
-          # Note we preserve the empty line
-        if options.warc:
-          _output(L)
-        continue
-      if state==1:
-        # HTTP header
-        wl -= len(L)
-        if not (L==b"" or L.startswith(b"\r")):
-          # Non-blank, it's a header
-          (h,_,v)=L.partition(b": ")
-          if bl is None and (h==b"Content-Length"):
-            bl=int(v)
-          if options.headers:
-            if isinstance(options.headers,dict):
-              if h in options.headers:
-                options.headers[h]=v
-            else:
-              _output(L)
-        else:
-          # Blank line, HTTP header is finished
-          if isinstance(options.headers,dict):
-            _output(bytes(str(options.headers),'utf-8'))
-          if not options.body:
-            return
-          if options.headers:
-            _output(L)
-          state=2
-          # The above is just for sanity, because we do _not_
-          #  continue with the outer loop,
-          #  since we can now block-output the entire rest of the
-          #  input buffer.
-          if bl is not None:
-            if bl!=wl:
-              print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
-                    (length,offset,filename,wl,bl,tr),file=sys.stderr)
-          # HTTP body
-          balance=clear_text.tell()
-          #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
-          # Output whatever is left
-          _output(cb[balance:balance+wl])
-          return
-
-def main():
-  global _output,TMPFILE,TMPFILENAME,tempfile
-  parser = argparse.ArgumentParser(
-    description='''Extract records from warc files given length, offset and file triples.
-  Input one triple on command line, or
-  triples from stdin as tab-delimited lines
-  or complete cdx index lines.
-  In all cases, 'filename' means crawlid/segmentid/type/filename''',
-    epilog='''Note that if no output flags are given,
-  the whole WARC record will be output, more efficiently than
-  would be the case if all three flags were given.''',
-    add_help=False,
-    conflict_handler='resolve',
-    formatter_class=HackFormat
-    )
-  fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s')
-  parser.add_argument('--help',help='Show help',action='help')
-  parser.add_argument('-d','--debug',help='Debug output',action='store_true')
-  parser.add_argument('-w','--warc',help='output WARC headers',
-                      action='store_true')
-  parser.add_argument('-h','--headers',help='process HTTP headers: collect into dict with named values (,-separated) if arg present, else output',
-                      nargs='?',default=None,const=True)
-  parser.add_argument('-b','--body',help='output HTTP body',
-                      action='store_true')
-  parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
-  parser.add_argument('-p','--process',help='with -c, launches CMD only once',
-                      action='store_true')
-  parser.add_argument('-m','--module.function',help='module.function to call with a stream'),
-  parser.add_argument('-s','--save',action='store_true',
-                      help="write to a temporary file and output the name")
-  parser.add_argument('-f','--fpath',
-                      help=fphelp,
-                      default=FPAT)
-  parser.add_argument('-r','--root',nargs='?',
-                  help='File path root, create a copy there if necessary',
-                  default='/beegfs/common_crawl'),
-  parser.add_argument('-z','--zipped',
-                      help="output raw gzipped record, ignored if any of -bhw supplied",
-                      action='store_true')
-  sg=parser.add_mutually_exclusive_group()
-  sg.add_argument('-x','--index',
-                      help='take lines of triples from a cdx index file as input',
-                      action='store_true')
-  sg.add_argument('length',type=int,
-                  help='length in bytes of gzipped record',
-                  nargs='?')
-  parser.add_argument('offset',type=int,
-                      help='start position in bytes of gzipped record',
-                      nargs='?')
-  parser.add_argument('filename',
-                      help='pathname of gzipped Common Crawl WARC-format file',
-                      nargs='?')
-  # Hack the order of optional and positional in the help output
-  parser._action_groups.sort(key=lambda g:g.title)
-  #parser.print_help()
-  pa=parser.parse_args(sys.argv[1:])
-  #print(pa,file=sys.stderr)
-  if pa.length is not None:
-    # We have to enforce our own check..
-    if pa.offset is None or pa.filename is None:
-      parser.error("length, offset and filename must all be supplied together")
-  if isinstance(pa.headers,str):
-    pa.headers=dict((bytes(k,'utf-8'),None) for k in pa.headers.split(','))
- 
-  buf=bytearray(128*1024*1024)
-
-  whole=not (pa.warc or pa.headers or pa.body)
-  if pa.save:
-    _output=_output_tmpfile
-    import tempfile
-  elif pa.cmd:
-    _output = _output_subproc
-  else:
-    _output = _output_stdout
-  if pa.cmd and pa.process:
-      launch(pa.cmd)
-  # three different ways to process
-  if pa.index:
-    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
-    for l in sys.stdin:
-      m=CDX.search(l)
-      if m is None:
-        if l.find('/robotstxt/')>-1:
-          continue
-        print("index line problem: \"%s\""%l,file=sys.stderr,end='')
-        exit(2)
-      filename=pa.fpath%(m[3:7])
-      process(pa,buf,filename,
-              int(offset:=m[2]),int(length:=m[1]),whole)
-  elif pa.length is not None:
-    print(pa.filename,file=sys.stderr)
-    process(pa,buf,pa.fpath%tuple(pa.filename.split('/')),
-            pa.offset,pa.length,whole)
-  else:
-    print("Reading length, offset, filename tab-delimited triples from stdin...",
-          file=sys.stderr)
-    for l in sys.stdin:
-      try:
-        (length,offset,filename)=l.rstrip().split('\t')
-        length=int(length)
-        offset=int(offset)
-      except ValueError as e:
-        parser.error('Invalid input line: %s\n "%s"'%(e,l))
-      process(pa,buf,pa.fpath%tuple(filename.split('/')),
-              offset,length,whole)
-  # processing done one way or another
-  if pa.cmd and pa.process:
-    windup(length,offset,filename)
-  # if pa.save and pa.process, deleting temp files is down to cmd
-if __name__ == "__main__":
-    main()
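
Illustrative invocations (paths and index files hypothetical; the default --root is /beegfs/common_crawl):

  # one record, whole WARC record to stdout:
  #   ix.py 1234 5678 CC-MAIN-2019-35/1566027313501.0/warc/CC-MAIN-...-00000.warc.gz
  # cdx index lines on stdin, HTTP bodies each piped through wc:
  #   zcat cdx-00000.gz | ix.py -x -b -c 'wc -c'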
--- a/bin/lmh_warc.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-'''Extract identifying info + LastModified header value for all entries
-   that have one
-
-   Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''
-
-import re,warc,sys,glob,codecs
-
-TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
-DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
-LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
-
-DTAB=bytearray(range(256))
-DDEL=b'TZ-:'
-
-OUT=open(sys.stdout.fileno(),'wb')
-
-def showmeLMH(wtype,buf,part):
-  global URI, DATE, SEGMENT, FILETYPE, FILENO
-  if part==1:
-    if (m:=TUPAT.search(buf)):
-      URI=m[1]
-    else:
-      raise ValueError(b"No target URI in %s ??"%buf)
-    if (md:=DPAT.search(buf)):
-      DATE=md[1]
-    else:
-      raise ValueError(b"No date in %s ??"%buf)
-  else:
-    mm=LMPAT.search(buf)
-    OUT.write(URI)
-    if mm:
-      OUT.write(b'\t')
-      OUT.write(DATE.translate(DTAB,DDEL))
-      OUT.write(b'\t')
-      OUT.write(SEGMENT)
-      OUT.write(b'\t')
-      OUT.write(FILETYPE)
-      OUT.write(b'\t')
-      OUT.write(FILENO)
-      OUT.write(b'\t')
-      OUT.write(mm[1])
-    OUT.write(b'\n')
-
-(CCdate, segment, filetype, fileno) = sys.argv[1:]
-fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
-  CCdate, segment, filetype, fileno)
-
-SEGMENT=codecs.encode(segment,'ascii')
-FILETYPE=codecs.encode(filetype,'ascii')
-FILENO=codecs.encode(fileno,'ascii')
-
-warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
-
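The DTAB/DDEL translation simply deletes 'T', 'Z', '-' and ':' from the WARC-Date, e.g.:

  b'2023-09-28T08:46:01Z'.translate(DTAB, DDEL)   # -> b'20230928084601'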
--- a/bin/merge_date.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,144 +0,0 @@
-#!/usr/bin/python3
-'''Add timestamps from Last-Modified-dated (ks.tsv) files into
-   that year's index
-
-Usage: merge_date.py ksvstream cdx-dir outdir
-
-ksvstream consists of tab-separated key, CC date, url and Unix timestamp
-''' # '
-
-import sys, io, os, os.path, time, re
-from isal import igzip
-
-
-DEBUG = 0
-while sys.argv[1] == '-d':
-  sys.argv.pop(1)
-  DEBUG += 1  
-
-XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
-NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
-
-RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
-b'(crawldiagnostics|robotstxt)/')
-SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
-                     b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
-                     b'=[^&]*)')
-ISESSION = re.compile(SESSION.pattern,flags=re.I)
-URL=re.compile(b'\{"url": "([^"]*)"')
-WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
-
-# Above based on this from broken Java code:
-# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
-#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
-#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
-#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
-#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
-#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
-
-#print(sys.argv[3],NPATH,file=sys.stderr)
-
-os.makedirs(sys.argv[3], exist_ok=True)
-
-FN = 0
-
-XCNT = WCNT = 0
-DCNT = 0
-
-XF = igzip.IGzipFile(filename=XPATH%0)
-NF = open(NN:=(NPATH%0),'wb')
-
-def nextLine():
-  '''Move on to next index file if current has run out'''
-  global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT
-  while True:
-    xl=XF.readline()
-    XCNT += 1
-    if xl == b'':
-      # need to move to next index file
-      FN += 1
-      XF.close()
-      NF.close()
-      print(NN, flush=True) # so we can compress it
-      print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True)
-      time.sleep(0.1) # so they flush?
-      XN=XPATH%FN
-      if not os.path.exists(XN):
-        return None
-      XF = igzip.IGzipFile(filename=XN)
-      NF = open((NN:=NPATH%FN), 'wb')
-      xl = XF.readline()
-      WCNT = XCNT = 1
-    if WARC.search(xl):
-      WCNT += 1
-      return xl
-    else:
-      NF.write(xl)
-      if DEBUG:
-        sys.stderr.write("out_rc\n")
-
-
-def nextDate(df,dn):
-  global DEBUG, DCNT, XCNT
-  dl = df.readline()
-  if dl == b'':
-    # write out the last of the last index file, if any
-    return "", "", "", 0
-  if DEBUG:
-    sys.stderr.write("dl%s: %s\n"%(dn,dl))
-  dkey, ddate, durl, dtime = dl.split(b'\t')
-  DCNT += 1
-  return dkey, ddate, durl, dtime
-
-with open(sys.argv[1], 'rb') as df:
-  DCNT = 0
-
-  dkey, ddate, durl, dtime = nextDate(df,1)
-
-  while (xl := nextLine()) is not None:
-    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
-    m = URL.match(xprops)
-    if m:
-      xurl = m[1]
-    else:
-      raise ValueError("No url in %s"%xprops)
-    if DEBUG:
-      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
-                                            for xp in (xkey, xdate, xurl))))
-    if dkey==xkey and ddate==xdate and durl==xurl:
-      # Got it
-      NF.write(xkey)
-      NF.write(b' ')
-      NF.write(xdate)
-      NF.write(b' ')
-      NF.write(xprops[:-2])
-      NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
-      if DEBUG:
-        sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
-                                             for xp in (xkey, xdate, xurl))))
-        sys.stderr.write(" %d\n"%int(dtime[:-3]))
-
-      dkey, ddate, durl, dtime = nextDate(df,2)
-      continue
-    else:
-      if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
-        # we've missed something, disaster looms
-        print("Fail2:"
-               "      xkey: %s\n"
-               "      dkey: %s\n"
-               "      xdate: %s\n"
-               "      ddate: %s\n"
-               "      xurl: %s\n"
-               "      durl: %s\n"
-               "FN: %s XCNT: %s DCNT: %s\n"
-               "xl: %s"%(xkey, dkey, xdate, ddate,
-                         xurl, durl,
-                         FN, XCNT, DCNT, xl),
-              file=sys.stderr)
-        # try to force recovery
-        dkey, ddate, durl, dtime = nextDate(df,3)
-        continue
-      # else fall through to write
-    NF.write(xl)
-    if DEBUG:
-      sys.stderr.write("out_nl\n")
--- a/bin/per_segment.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-#!/usr/bin/python3
-'''refactor a per-cdx count table to be per-segment
-input on STDIN
-Usage: per_segment segment-column
-Assumes column 0 is empty, count is in column 1
-Segment column is 0-origin
-'''
-
-import sys
-
-c=int(sys.argv[1])
-
-ss=[dict() for i in range(100)]
-
-for l in sys.stdin:
-  try:
-    cc=l.split('\t')
-    s=int(cc.pop(c))
-    n=int(cc.pop(1))
-    ll='\t'.join(cc[1:]) # note we ditch the initial empty column
-    #print(s,n,cc,ll,sep='|')
-    #exit(0)
-    t=ss[s].get(ll,0)
-    ss[s][ll]=t+n
-  except:
-    sys.stdout.write(l)
-    print(cc)
-    exit(1)
-
-# note this won't work if c is last column!
-for s in range(100):
-  with open('s%s.tsv'%s,'w') as f:
-    for (l,c) in sorted(ss[s].items(),key=lambda p:p[1],reverse=True):
-      f.write(str(c))
-      f.write('\t')
-      f.write(l)
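A worked example, assuming segment-column 3: the hypothetical input line below adds 42 to segment 17's bucket for the label 'labelA\tmore':

  # input line:  \t42\tlabelA\t17\tmore
  cc = ['', '42', 'labelA', '17', 'more']
  s = int(cc.pop(3)); n = int(cc.pop(1))
  ll = '\t'.join(cc[1:])   # 'labelA\tmore'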
--- a/bin/percent_encode.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-'''Handle unquoting of non-UTF-8 bytes by %-encoding them'''
-import codecs
-
-def percent_encode(ude):
-  #print(ude.object,ude.object[ude.start:ude.end])
-  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
-          ude.end)
-
-codecs.register_error('percent',percent_encode)
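Once registered, the handler lets undecodable bytes round-trip as %XX escapes, e.g.:

  import percent_encode            # importing registers the 'percent' handler
  b'caf\xe9'.decode('utf-8', errors='percent')   # -> 'caf%E9'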
--- a/bin/plinks.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-#!/usr/bin/env python3
-import sys,pdfx,traceback,os
-from datetime import datetime
-
-def run(file):
-  try:
-    pdf=pdfx.PDFx(file)
-    links=pdf.get_references_as_dict()
-    if bool(links) and (links.get('scrape',False) or
-                        links.get('annot',False)):
-      for k in links.keys():
-        for l in links[k]:
-          print("%s\t%s"%(k,l))
-    else:
-      print("None")
-  except Exception as e:
-    if str(e)=='Unexpected EOF':
-      print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr)
-      print("badpdf")
-    else:
-      print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr)
-      traceback.print_exc(file=sys.stderr)
-
-if sys.argv[1]=='-':
-  i=0
-  for l in sys.stdin:
-    print(i,file=sys.stderr)
-    i+=1
-    f=l.rstrip()
-    if os.path.getsize(f)==1048576: # truncated
-      print("truncated",file=sys.stderr)
-      print("truncated")
-    else:
-      run(f)
-    os.unlink(f)
-else:
-  run(sys.argv[1])
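Illustrative usage (note that in '-' mode each listed file is unlinked after processing):

  #   plinks.py some.pdf
  #   find pdfs/ -name '*.pdf' | plinks.py -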
--- a/bin/sort_date.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
-#!/usr/bin/python3
-'''Process output of lmh_warc [new 3-column version]
-   Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
-'''
-
-# Assumes you have used fgrep $'\t' on input for speed
-# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
-#  to fix a common 'bad' timestamp (~ .2% of inputs)
-
-import email.utils
-import sys
-from urllib.parse import urlsplit, quote, unquote
-import surt
-
-import re, codecs
-from itertools import chain
-
-WPAT = re.compile('(,www\\d*)+\\)')
-
-# Thanks to https://stackoverflow.com/a/8776871
-import locale
-from functools import cmp_to_key
-
-def percent_encode(ude):
-  #print(ude.object,ude.object[ude.start:ude.end])
-  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
-          ude.end)
-
-codecs.register_error('percent',percent_encode)
-
-def _u_esc(c):
-  if c<65536:
-    return '\\u%04X'%c
-  else:
-    return '\\U%08X'%c
-
-def java_unicode_encode(ude):
-  '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn'''
-  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
-          ude.end)
-
-codecs.register_error('java_unicode',java_unicode_encode)
-
-# From RFC-3986:
-# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
-# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
-#                / "*" / "+" / "," / ";" / "="
-# But # _is_ escaped in Java surt results
-#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
-
-# Note also that although quote already does _not_ quote - . / _ ~
-#  they are included below as that's what we find in surt.surt 0.3.1
-
-# Also, Java surt strips _all_ leading 'www\d*.',
-#  where python3 surt only strips the first one.
-
-# And Java strips so-called optional session-ids, but python doesn't
-
-import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
-import surt.URLRegexTransformer
-
-ident = ''.join(chr(i) for i in range(256)).encode('latin-1')
-
-IDMAP=bytes.maketrans(ident,ident)
-
-# For removal of non-printing characters:
-#  Note, this is only a guess; the only example seen so far is DEL
-NONPRINT= ''.join(chr(i) for i in chain(range(9),
-                                      range(14,32),
-                                      [127] # DEL
-                                      )).encode('latin-1')
-
-def notDefaultCanon(hu,**options):
-  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
-    # Try to fix the incompatibility between Java and 
-    #  Python surt handling of 'octal' numbers in numeric IPv4 addresses
-    #  and it should!  See "After this line:
-    # 
-    # 15,225,107,143)" in .../azure/notes.txt
-    try:
-      bytestrs = hu.host.split(b'.')
-      hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
-    except ValueError:
-      pass
-  if hu.query:
-    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)
-  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
-
-# Hack this to reproduce the Java bug
-surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
-    re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
-    re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
-    ]
-
-# Above based on this from broken Java code:
-# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
-#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
-#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
-#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
-#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
-#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
-
-def cdx_key(uristring):
-  _surt = quote(unquote(surt.surt(unquote(uristring),
-                                  canonicalizer=notDefaultCanon),
-                        errors='percent'),
-                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # '
-                  ).lower()
-                # Wrt \x7f (DEL), see discussion in notes wrt
-                #   "biz,televida)" case
-                # It remains to be seen whether other non-printing bytes
-                #  will need to be treated as 'safe'
-  return WPAT.sub(')',_surt)
-
-def keyed(l):
-  uri, cc_stamp, dateTime = l.split('\t',2)
-  #print('ul',uri,file=sys.stderr)
-  try:
-    try:
-      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
-    except OverflowError:
-      epoch = 32535215999.0
-    return ((cdx_key(uri), cc_stamp, uri), epoch)
-  except (TypeError,IndexError,ValueError) as e:
-    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
-    return
-
-fstr = sys.argv[1]
-
-with open(fstr,"r") as ff:
-  # crucial that the following is done _after_ the file is opened
-  #  with the default (utf-8) locale!
-  locale.setlocale(locale.LC_ALL, "C")
-  ctk=cmp_to_key(locale.strcoll)
-  for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
-                   key=lambda x:x[0]):
-    print(key[0],key[1],
-          key[2].encode('ascii',errors='java_unicode').decode('ascii'),
-          ts,sep='\t')
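The java_unicode error handler mimics Java's uppercase \uNNNN escapes, e.g.:

  'naïve'.encode('ascii', errors='java_unicode')   # -> b'na\\u00EFve'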
--- a/bin/spearman.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,188 +0,0 @@
-#!/usr/bin/env python3
-'''Rank correlation processing for a csv tabulation of counts by segment 
-   First column is for whole crawl, then 100 columns for segs 0-99
-   Each row is counts for some property, e.g. mime-detected or tld
-
-   For example, assuming all.tsv has the whole-crawl warc-only counts
-   and s...tsv have the segment counts, all with counts in column 1,
-
-   tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
-
-   will produce such a file with
-     * 100 rows, one for each of the top 100 counts
-     * 101 columns, 0 for all and 1--100 for segs 0--99
-
-   Usage: python3 -i spearman.py name id
-     where name.csv has the input
-'''
-     
-import numpy as np
-from numpy import loadtxt
-from scipy import stats
-import statsmodels.api as sm
-import matplotlib.pyplot as plt
-import pylab
-
-import sys, math
-
-def qqa():
-  # q-q plot for the whole crawl
-  sm.qqplot(all, line='s')
-  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
-  plt.show()
-
-def qqs():
-  # q-q plots for the best and worst (by variance) segments
-  global xv, xworst, xbest
-  xv=[d.variance for d in xd]
-  xworst=xv.index(max(xv))
-  xbest=xv.index(min(xv))
-  print(xbest,xworst)
-  sm.qqplot(x[xbest], line='s')
-  plt.gca().set_title('Best segment (least variance): %s'%xbest)
-  plt.show()
-  sm.qqplot(x[xworst], line='s')
-  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
-  plt.show()
-
-def plot_x(sort=False,block=True,all_only=True,title=None):
-  # Make these two subplots, w. and w/o sorting
-  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
-  #  for legend hacking
-  if sort:
-    aso=np.argsort(-all)
-    plot_all=all[aso]
-    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
-  else:
-    plot_all=all
-    plot_x=[xd[i].mean for i in range(N)]
-  if title is None:
-    l1='Rank correlation of segment x whole crawl'
-    l2='Mean of segment x whole crawl'
-    plt.legend(loc='best',fontsize='small')
-  else:
-    l1=l2=None
-  plt.plot(plot_all,'rx',label=l1)
-  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
-  if not(all_only):
-    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
-    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
-  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
-  plt.grid(True)
-  if title is not None:
-    plt.title(title)
-  plt.show(block=block)
-
-def hist_x(align='mid'):
-  hist(xm,xsd,[xd[i].mean for i in range(N)],
-       'Mean of rank correlation of each segment x all other segments',
-       align)
-
-def hist_all(align='mid'):
-  hist(all_m,np.sqrt(all_s.variance),all,
-       'Rank correlation of each segment x whole crawl %s'%id,
-       align)
-
-def hist(m,sd,hh,title,align):
-  sdd=[(i,m-(i*sd)) for i in range(-2,3)]
-  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
-  sdax=hax.twiny()
-  hax.hist(hh,color='lightblue',align=align)
-  hax.set_title(title)
-  for s,v in sdd:
-       sdax.plot([v,v],[0,18],'b')
-  sdax.set_xlim(hax.get_xlim())
-  sdax.set_ylim(hax.get_ylim())
-  sdax.set_xticks([v for s,v in sdd])
-  sdax.set_xticklabels([str(s) for s,v in sdd])
-  plt.show()
-
-def ci(rho,n,conf=0.95):
-  # Courtesy of https://stats.stackexchange.com/a/18904
-  # rho is (rank) correlation, n is sample size
-  stderr=1.0/math.sqrt(n-3)
-  z=stats.norm.ppf(1.0-((1.0-conf)/2))
-  delta=z*stderr
-  lower=math.tanh(math.atanh(rho)-delta)
-  upper=math.tanh(math.atanh(rho)+delta)
-  return (lower,upper)
-
-def plot_ci(rhos,n,trim=None,conf=0.95):
-   # rhos are (rank) correlation values
-   rhos_s=rhos[(-rhos).argsort()]
-   if trim is None:
-     l=len(rhos)
-   else:
-     rhos_s=rhos_s[:trim]
-     l=trim
-   cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
-   ue=cc[1]-rhos_s
-   le=rhos_s-cc[0]
-   #for i in range(len(rhos)):
-     #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
-   plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-   plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
-   plt.show()
-
-def first_diff(ranks):
-  # first disagreement with baseline == {1,2,...}
-  for i in range(len(ranks)):
-    if ranks[i]!=i+1.0:
-      return i
-  return i+1
-
-def ranks():
-  # Combine segment measures:
-  #  segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
-  # convert to ranks, smallest value == highest rank
-  all_ranked=stats.rankdata(-all,method='average') # invert since
-                                                   #  large corr is good
-  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
-                                                  # small corr variance is good
-  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
-                                                   # invert since
-                                                   #  large mean corr is good
-  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
-                                                   # invert since
-                                                   #  large first diff is good
-  return np.array([[i,
-                    all_ranked[i],
-                    x_variance_ranked[i],
-                    x_mean_ranked[i],
-                    fd_ranked[i]] for i in range(N)])
-
-def main():
-  global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
-  global aa, aa_by_all, N
-  counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
-  id=sys.argv[2]
-  N=counts.shape[1]-1
-  # "If axis=0 (default), then each column represents a variable, with
-  #        observations in the rows"
-  # So each column is a sequence of counts, for whole crawl in column 0
-  #   and for segments 0--N-1 in columns 1--N
-  corr=stats.spearmanr(counts,nan_policy='omit').correlation
-
-  all=corr[0][1:]
-  all_s=stats.describe(all)
-  all_m=all_s.mean
-
-  x=np.array([np.concatenate((corr[i][1:i],
-                              corr[i][i+1:])) for i in range(1,N+1)])
-  # The above, although transposed, works because the correlation matrix
-  #  is symmetric
-  xd=[stats.describe(x[i]) for i in range(N)]
-  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
-  xm=xs.mean
-  xsd=np.sqrt(xs.variance)
-
-  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
-
-  aa=ranks()
-  aa_by_all=aa[aa[:,1].argsort()]
-
-### I need to review rows, e.g. counts[0] is an array of N+1 counts
-###   for the most common label in the complete crawl,
-###   from the complete crawl and all the segments
-### versus columns, e.g. counts[:,0] is an array of N decreasing counts
-###   for all the labels in the complete crawl
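ci() is the standard Fisher-z confidence interval; a quick sanity check (approximate values):

  ci(0.98, 100)   # -> roughly (0.970, 0.987)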
--- a/bin/test_warc.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-import warc,sys
-
-OUT=open(sys.stdout.fileno(),'wb')
-
-if (debug:=(sys.argv[1]=='-d')):
-  sys.argv.pop(1)
-
-tt=int(sys.argv.pop(1))
-
-def showme(wtype,buf,part):
-  # This should exactly reproduce a complete warc file if called
-  #  as per version 1 below
-  if debug:
-    OUT.write(b"----start %d-----\n"%part)
-  OUT.write(buf)
-  if buf[-1]!=10:
-    OUT.write(b'\r\n')
-  if part==7:
-    OUT.write(b'\r\n') # to match complete file formatting
-  if debug:
-    OUT.write(b"----end %d-----\n"%part)
-  return OUT
-
-if tt==1:
-  warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],parts=int(sys.argv[2]),debug=debug)
-elif tt==2:
-  warc.warc(sys.argv[1],showme,[b'warcinfo'],parts=int(sys.argv[2]),debug=debug)
-elif tt==3:
-  warc.warc(sys.argv[1],showme,[b'warcinfo'],whole=True,debug=debug)
-elif tt==4:
-  warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],whole=True,debug=debug)
-elif tt==5:
-  warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug)
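Per the comment in showme, mode 1 with parts=7 is intended to reproduce the unzipped input byte-for-byte; a plausible check:

  #   python3 test_warc.py 1 file.warc.gz 7 | cmp - <(zcat file.warc.gz)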
--- a/bin/warc.py	Wed Sep 27 17:29:51 2023 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,167 +0,0 @@
-#!/usr/bin/env python3
-'''Stream a warc format file, unzipping if necessary, invoking a
-callback on each record.  Callback can be limited by WARC-Type, record
-part'''
-
-import sys,io
-from isal import igzip
-
-RESP = b'response'
-REQ = b'request'
-META = b'metadata'
-INFO = b'warcinfo'
-
-def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
-  '''parts is a bit-mask:
-     1 for warc header;
-     2 for req/resp HTTP header, warcinfo/metadata features;
-     4 for req/resp body'''
-  # should do some sanity checking wrt parts and types
-  types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
-  nb=0
-  if filename.endswith(".gz"):
-    stream=igzip.IGzipFile(filename=filename)
-  else:
-    stream=open(filename,'rb',0)
-  bufSize=2*1024*1024
-  hdrMax=16*1024
-  buf=bytearray(bufSize)
-  bufView=memoryview(buf)
-  fpos=bl=stream.readinto(buf)
-  bp=0
-  done=False
-  while True:
-    while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty
-      bp+=2
-    start_1=bp
-    if not buf.startswith(b'WARC/1.0\r\n',bp):
-      if done and bl-bp==0:
-        # really done
-        return
-      raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename,
-                                                                   bp,bl,fpos,
-         (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
-                                                                     bl-bp))
-    bp+=10
-    wtype=None
-    length=None
-    state=1
-    tr=None # Was this record truncated?
-    while not buf.startswith(b'\r\n',bp):
-      # there should always be enough in the buffer to complete this loop,
-      #  because of the buffer update logic below
-      eol=buf.index(b'\r\n',bp)+2
-      if buf.startswith(b"Content-Length: ",bp):
-        length=wl=int(bufView[bp+16:eol-2])
-      elif buf.startswith(b"WARC-Truncated: ",bp):
-        if bp+16==eol-2:
-          tr=b"EMPTY"
-        else:
-          tr=bytes(bufView[bp+16:eol-2])
-      elif buf.startswith(b'WARC-Type: ',bp):
-        if buf.startswith(b's',bp+13):
-          wtype = RESP
-        elif buf.startswith(b'q',bp+13):
-          wtype = REQ
-        elif buf.startswith(b'm',bp+11):
-          wtype = META
-        elif buf.startswith(b'w',bp+11):
-          wtype = INFO
-        else:
-          raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
-                             bytes(bufView[bp+11:eol-2]),filename,
-                             fpos-(bl-bp)))
-      bp=eol
-    bp=eol+2
-    if done:
-      if (bp+length)>bl:
-        raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
-                         length,bl,filename))
-    elif (bp+(length+hdrMax))>bl:
-      # Need more data
-      if wtype in types:
-        # we need to keep from start_1 to bl
-        keepFrom=start_1
-        keepLen=bl-keepFrom
-        buf[0:keepLen]=bufView[keepFrom:bl]
-        eol=eol-start_1
-        start_1=0
-        bp=eol+2
-      else:
-        # we can skip the rest of this part
-        if (bp+length)<=bl:
-          # we have at least some bytes from the next part
-          keepLen=bl-(bp+length)
-          buf[0:keepLen]=bufView[bl-keepLen:bl]
-        else:
-          # we don't have all of the bytes from the current part
-          #  so can skip the rest of it
-          keepLen=0
-          fpos=stream.seek(fpos+(bp+length-bl))
-        bp=0
-      spaceToFill=bufSize-keepLen
-      with memoryview(buf)[keepLen:bufSize] as xBuf:
-        nb=stream.readinto(xBuf)
-      fpos+=nb
-      bl=keepLen+nb
-      if nb<spaceToFill:
-        done=True
-      if wtype not in types:
-        continue
-    if (wtype in types):
-      # Output whole or part 1 as required
-      if whole:
-        bp+=length
-        OUT=callback(wtype,bufView[start_1:bp],7)
-        continue
-      elif (parts & 1):
-        OUT=callback(wtype,bufView[start_1:eol],1)
-      if parts!=1:
-        while buf.startswith(b'\r\n',bp):
-          bp+=2
-        start_2=bp
-        eob=bp+length
-        while buf.startswith(b'\r\n',eob-2):
-          eob-=2
-        # Only output parts (2 = HTTP header, 4 = body) that are wanted
-        if parts & 2:
-          if wtype is META or wtype is INFO:
-            # rest of the part
-            OUT=callback(wtype,bufView[start_2:eob],2)
-          else:
-            # request and response have http headers
-            eo2=buf.index(b'\r\n\r\n',start_2)
-            OUT=callback(wtype,bufView[start_2:eo2+2],2)
-        if parts & 4:
-          for L in rec_text:
-            if state==2:
-              # HTTP header
-              wl -= len(L)
-              if not (L==b"" or L.startswith(b"\r")):
-                # Non-empty, it's (a continuation of) a header
-                if bl is None and L.startswith(b"Content-Length: "):
-                  bl=int(L[16:].rstrip())
-              else:
-                # Blank line, HTTP header is finished
-                if parts & 2:
-                  callback(wtype,bufView[start_2:start_2+L_start],2)
-                state=4
-                # The above is just for sanity, because we do _not_
-                #  continue with the outer loop,
-                #  since we can now block-output the entire rest of the
-                #  input buffer.
-                if bl is not None:
-                  if bl!=wl:
-                    print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
-                          (length,offset,filename,wl,bl,tr),file=sys.stderr)
-                # HTTP body
-                balance=start_2+rec_text.tell()
-                #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
-                # Output whatever is left
-                if parts & 4:
-                  callback(wtype,bufView[balance:balance+wl],4)
-                state=1
-
-              L_start=rec_text.tell()
-    bp+=length
-    #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
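The parts bitmask composes with | (1 = WARC header, 2 = HTTP header, 4 = body); a minimal sketch that collects response HTTP headers (buf is a memoryview, so copy anything you keep):

  import warc
  headers = []
  def grab(wtype, buf, part):
      if part == 2:
          headers.append(bytes(buf))
  warc.warc('file.warc.gz', grab, [b'response'], parts=2)   # filename hypothetical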
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/build_idx.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,31 @@
+#!/usr/bin/python3
+'''Turn a merge_nnn.log file into a cluster.idx file
+   We cheat and use the old cluster.idx to save having to read
+   all the cdx-....gz files'''
+import sys
+
+with open('cluster.idx','r') as oidx, open('new.idx','w') as nidx:
+  i=-1
+  curpos=0
+  target="cdx-00%03d.gz"%i
+  log=open("/dev/null",'r') # embarassing hack
+  for ol in oidx:
+    (surt, datestamp, file, offset, length, cnt) = ol.split()
+    if file!=target:
+      i+=1
+      target="cdx-00%03d.gz"%i
+      log.close()
+      curpos=0
+      log=open('merge_%d.log'%(i+1),'r')
+      hdr=log.readline()
+      (j,f) = hdr.split()
+      sys.stderr.write(hdr)
+      if int(j)!=i+1:
+        raise ValueError("wrong file: i=%s, j=%s"%(i,j))
+    nl=log.readline()
+    if not nl:
+      sys.stderr.write('quitting early: %s\n'%i)
+      exit(1)
+    nlen=int(nl)
+    nidx.write("%s %s\t%s\t%s\t%s\t%s\n"%(surt, datestamp, file, curpos, nlen, cnt))
+    curpos+=nlen
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/count_warc.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+import warc,sys
+
+OUT=open(sys.stdout.fileno(),'wb')
+
+if (debug:=(sys.argv[1]=='-d')):
+  sys.argv.pop(1)
+
+def countme(wtype,buf,part):
+  if debug:
+    breakpoint()
+  OUT.write(b"%d\n"%len(buf))
+
+#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
+#warc(showme,[b'response'],whole=True)
+
+warc.warc(sys.argv[1],countme,[b'response'],parts=int(sys.argv[2]),debug=debug)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/fix_key.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,31 @@
+#!/usr/bin/python3
+from percent_encode import percent_encode
+from urllib.parse import quote, unquote
+import sys
+
+# From RFC-3986:
+# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+#                / "*" / "+" / "," / ";" / "="
+# But # _is_ escaped in Java surt results
+#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
+# Note also that quote already does _not_ quote - . / _ ~
+
+# Also, Java surt strips _all_ leading 'www.',
+#  where python3 surt only strips the first one.
+
+with open(sys.argv[1],"r") as f:
+  for l in f:
+    while l.endswith(',www',0,ploc:=l.index(')')):
+      l=l[:ploc-4]+l[ploc:]
+    if '%' in l:
+      (key,wt,ts)=l.split('\t')
+      sys.stdout.write(quote(unquote(key,errors='percent'),
+                             safe='!"$&\'()*+,:;<=>?@[\\]^`{|}').lower())
+      sys.stdout.write('\t')
+      sys.stdout.write(wt)
+      sys.stdout.write('\t')
+      sys.stdout.write(ts)
+    else:
+      sys.stdout.write(l)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/ix.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+'''Extract response records from Common Crawl WARC-format files
+given length, offset and filename triples.
+Input one triple on command line, or
+triples from stdin as tab-delimited lines
+or complete cdx index lines.
+In all cases, 'filename' means crawlid/segmentid/type/filename
+
+Note that if no output flags are given, the whole WARC record will be output, more efficiently than if -whb were given.'''
+
+import sys, argparse, regex, os, shutil, io, gzip, time, shlex
+from isal import igzip
+from subprocess import Popen, PIPE
+#import asyncio
+
+HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
+BINOUT=sys.stdout.buffer
+FPAT="/%s/%s/orig/%s/%s"
+
+CMD_PROC=None
+TMPFILENAME=None
+
+class HackFormat(argparse.RawDescriptionHelpFormatter):
+  def format_help(self):
+    FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
+    return HACK_USAGE.sub('\n             [ ( -x | length offset filename ) ]',
+                          FOO)
+
+def process(options,buf,filename,offset,length,whole):
+    try:
+      process0(options,buf,filename,offset,length,whole)
+    except Exception as e:
+      if options.debug:
+        import traceback
+        traceback.print_exc(file=sys.stderr)
+      else:
+        print("Process fail: %s, input line:\n %s"%(e,l),
+              file=sys.stderr,end='')
+      exit(3)
+
+def process0(options,buf,filename,offset,length,whole):
+  global TMPFILENAME, TMPFILE
+  if options.save:
+    (tf,TMPFILENAME)=tempfile.mkstemp()
+    TMPFILE=open(tf,mode='wb')
+  if options.cmd and not options.process:
+    launch(options.cmd)
+  process1(options,buf,filename,offset,length,whole)
+  if options.save:
+    TMPFILE.close()
+    if options.cmd:
+      _output_subproc(bytes(TMPFILENAME,'utf-8'))
+      _output_subproc(b"\n")
+    else:
+      BINOUT.write(bytes(TMPFILENAME,'utf-8'))
+      BINOUT.write(b"\n")
+  if options.cmd:
+    if not options.process:
+      windup(length,offset,filename)
+    if options.save:
+      os.unlink(TMPFILENAME)
+      TMPFILENAME=None
+  elif options.save:
+    print("%s will need to be deleted"%TMPFILENAME,file=sys.stderr)
+    TMPFILENAME=None
+
+def launch(cmd):
+  global CMD_PROC, BINOUT
+  CMD_PROC=Popen(shlex.split(cmd),stdin=PIPE,bufsize=0)
+  BINOUT=CMD_PROC.stdin
+
+def windup(length,offset,filename):
+  # Wind up subproc
+  BINOUT.close()
+  if CMD_PROC.wait()!=0:    # could/should be async?
+    print("subproc of %s:%s:%s failed with %s"%(length,offset,filename,
+                                                CMD_PROC.returncode),
+          file=sys.stderr)
+  
+def _output_tmpfile(buf):
+  TMPFILE.write(buf)
+
+def _output_stdout(buf):
+  BINOUT.write(buf)
+
+def _output_subproc(buf):
+  toWrite=len(buf)
+  while toWrite>0:
+    toWrite -= BINOUT.write(buf)    
+
+def process1(options,buf,filename,offset,length,whole):
+  root=options.root
+  rfn=root+filename
+  if root!="/beegfs/common_crawl":
+    # Support using ramdisk or other local disk as a faster cache
+    if not os.path.exists(rfn):
+      if not os.path.exists(os.path.dirname(rfn)):
+        os.makedirs(os.path.dirname(rfn))
+      with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \
+              io.FileIO(rfn,'w') as outfile:
+        #shutil.copyfileobj(infile,outfile,128*1024*1024)
+        while True:
+          l=infile.readinto(buf)
+          if l==0:
+            break
+          outfile.write(memoryview(buf)[:l])
+  file=open(rfn,'rb',0)
+  file.seek(offset)
+  bv=memoryview(buf)[:length]
+  nb=file.readinto(bv)
+  file.close()
+  if nb!=length:
+    raise ValueError("Chunk read losing: %s, got %s expected %s at %s"%(file.name,
+                                                                  nb,length,offset))
+  if whole and options.zipped:
+    _output(bv)
+    return
+  gzip_chunk = io.BytesIO(bv)
+  uv=memoryview(buf)[length:]
+  with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
+    ll=0
+    while True:
+      l=gzip_fin.readinto(uv)
+      if not l:
+        break
+      ll+=l
+    cb=memoryview(uv)[:ll]
+    if whole:
+      _output(cb)
+      return
+  # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
+  state=0
+  tr=None # Was this record truncated?
+  bl=None # HTTP Content-Length header value, i.e. the expected body length
+  # Could we make this faster by working purely within the cb memoryview?
+  # It would be messy, but avoid copying huge amounts
+  # The outer loop would just be something like
+  #   clbv=memoryview(bytearray(b"Content-Length: "))
+  #   i=s=0
+  #   while i<ll:
+  #     if cb[i]==10: # need to handle \r\n
+  #       L=cb[s:i]
+  #       s=i=i+1
+  #       if L[:16]==clbv:
+  #         wl=int(L[16:])
+  #     else:
+  #       i+=1
+  #
+  with io.BytesIO(cb) as clear_text:
+    for L in clear_text:
+      if state==0:
+        # WARC header
+        if L.startswith(b"Content-Length: "):
+          wl=int(L[16:].rstrip())
+        elif L.startswith(b"WARC-Truncated: "):
+          tr=L[16:].rstrip()
+          tr="EMPTY" if tr=="" else tr
+        elif L==b"" or L.startswith(b"\r"): # for idempotency
+          # Blank line, WARC header is finished
+          if not (options.headers or options.body):
+            return
+          state=1
+          # Note we preserve the empty line
+        if options.warc:
+          _output(L)
+        continue
+      if state==1:
+        # HTTP header
+        wl -= len(L)
+        if not (L==b"" or L.startswith(b"\r")):
+          # Non-blank, it's a header
+          (h,_,v)=L.partition(b": ")
+          if bl is None and (h==b"Content-Length"):
+            bl=int(v)
+          if options.headers:
+            if isinstance(options.headers,dict):
+              if h in options.headers:
+                options.headers[h]=v
+            else:
+              _output(L)
+        else:
+          # Blank line, HTTP header is finished
+          if isinstance(options.headers,dict):
+            _output(bytes(str(options.headers),'utf-8'))
+          if not options.body:
+            return
+          if options.headers:
+            _output(L)
+          state=2
+          # The above is just for sanity, because we do _not_
+          #  continue with the outer loop,
+          #  since we can now block-output the entire rest of the
+          #  input buffer.
+          if bl is not None:
+            if bl!=wl:
+              print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
+                    (length,offset,filename,wl,bl,tr),file=sys.stderr)
+          # HTTP body
+          balance=clear_text.tell()
+          #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
+          # Output whatever is left
+          _output(cb[balance:balance+wl])
+          return
+
+def main():
+  global _output,TMPFILE,TMPFILENAME,tempfile
+  parser = argparse.ArgumentParser(
+    description='''Extract records from warc files given length, offset and file triples.
+  Input one triple on command line, or
+  triples from stdin as tab-delimited lines
+  or complete cdx index lines.
+  In all cases, 'filename' means crawlid/segmentid/type/filename''',
+    epilog='''Note that if no output flags are given,
+  the whole WARC record will be output, more efficiently than
+  would be the case if all three flags were given.''',
+    add_help=False,
+    conflict_handler='resolve',
+    formatter_class=HackFormat
+    )
+  fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s')
+  parser.add_argument('--help',help='Show help',action='help')
+  parser.add_argument('-d','--debug',help='Debug output',action='store_true')
+  parser.add_argument('-w','--warc',help='output WARC headers',
+                      action='store_true')
+  parser.add_argument('-h','--headers',help='process HTTP headers: collect into dict with named values (,-separated) if arg present, else output',
+                      nargs='?',default=None,const=True)
+  parser.add_argument('-b','--body',help='output HTTP body',
+                      action='store_true')
+  parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
+  parser.add_argument('-p','--process',help='with -c, launches CMD only once',
+                      action='store_true')
+  parser.add_argument('-m','--module.function',help='module.function to call with a stream'),
+  parser.add_argument('-s','--save',action='store_true',
+                      help="write to a temporary file and output the name")
+  parser.add_argument('-f','--fpath',
+                      help=fphelp,
+                      default=FPAT)
+  parser.add_argument('-r','--root',nargs='?',
+                  help='File path root, create a copy there if necessary',
+                  default='/beegfs/common_crawl')
+  parser.add_argument('-z','--zipped',
+                      help="output raw gzipped record, ignored if any of -bhw supplied",
+                      action='store_true')
+  sg=parser.add_mutually_exclusive_group()
+  sg.add_argument('-x','--index',
+                      help='take lines of triples from a cdx index file as input',
+                      action='store_true')
+  sg.add_argument('length',type=int,
+                  help='length in bytes of gzipped record',
+                  nargs='?')
+  parser.add_argument('offset',type=int,
+                      help='start position in bytes of gzipped record',
+                      nargs='?')
+  parser.add_argument('filename',
+                      help='pathname of gzipped Common Crawl WARC-format file',
+                      nargs='?')
+  # Hack the order of optional and positional in the help output
+  parser._action_groups.sort(key=lambda g:g.title)
+  #parser.print_help()
+  pa=parser.parse_args(sys.argv[1:])
+  #print(pa,file=sys.stderr)
+  if pa.length is not None:
+    # argparse can't express 'all or none' for positionals, so check ourselves
+    if pa.offset is None or pa.filename is None:
+      parser.error("length, offset and filename must all be supplied together")
+  if isinstance(pa.headers,str):
+    pa.headers=dict((bytes(k,'utf-8'),None) for k in pa.headers.split(','))
+ 
+  buf=bytearray(128*1024*1024)
+
+  whole=not (pa.warc or pa.headers or pa.body)
+  if pa.save:
+    _output=_output_tmpfile
+    import tempfile
+  elif pa.cmd:
+    _output = _output_subproc
+  else:
+    _output = _output_stdout
+  if pa.cmd and pa.process:
+      launch(pa.cmd)
+  # three different ways to process
+  if pa.index:
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
+    for l in sys.stdin:
+      m=CDX.search(l)
+      if m is None:
+        if l.find('/robotstxt/')>-1:
+          continue
+        print("index line problem: \"%s\""%l,file=sys.stderr,end='')
+        exit(2)
+      filename=pa.fpath%(m[3:7])
+      length=int(m[1])
+      offset=int(m[2])
+      process(pa,buf,filename,offset,length,whole)
+  elif pa.length is not None:
+    print(pa.filename,file=sys.stderr)
+    process(pa,buf,pa.fpath%tuple(pa.filename.split('/')),
+            pa.offset,pa.length,whole)
+  else:
+    print("Reading length, offset, filename tab-delimited triples from stdin...",
+          file=sys.stderr)
+    for l in sys.stdin:
+      try:
+        (length,offset,filename)=l.rstrip().split('\t')
+        length=int(length)
+        offset=int(offset)
+      except ValueError as e:
+        parser.error('Invalid input line: %s\n "%s"'%(e,l))
+      process(pa,buf,pa.fpath%tuple(filename.split('/')),
+              offset,length,whole)
+  # processing done one way or another
+  if pa.cmd and pa.process:
+    windup(length,offset,filename)
+  # if pa.save and pa.process, deleting temp files is down to cmd
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh_warc.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+'''Extract identifying info + LastModified header value for all entries
+   that have one
+
+   Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''
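+
+# e.g. (hypothetical arguments): lmh_warc.py 2019-35 0 warc 042
+#  Emits one line per response record: URI, then, when a Last-Modified
+#  header is present, tab-separated CC date (14 digits), segment,
+#  filetype, fileno and the Last-Modified value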
+
+import re,warc,sys,glob,codecs
+
+TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
+DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
+
+DTAB=bytearray(range(256))
+DDEL=b'TZ-:'
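+# DTAB is the identity translation table, so translate(DTAB,DDEL) just
+#  deletes 'T','Z','-',':', e.g. b'2023-09-28T08:46:01Z' -> b'20230928084601'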
+
+OUT=open(sys.stdout.fileno(),'wb')
+
+def showmeLMH(wtype,buf,part):
+  global URI, DATE, SEGMENT, FILETYPE, FILENO
+  if part==1:
+    if (m:=TUPAT.search(buf)):
+      URI=m[1]
+    else:
+      raise ValueError(b"No target URI in %s ??"%buf)
+    if (md:=DPAT.search(buf)):
+      DATE=md[1]
+    else:
+      raise ValueError(b"No date in %s ??"%buf)
+  else:
+    mm=LMPAT.search(buf)
+    OUT.write(URI)
+    if mm:
+      OUT.write(b'\t')
+      OUT.write(DATE.translate(DTAB,DDEL))
+      OUT.write(b'\t')
+      OUT.write(SEGMENT)
+      OUT.write(b'\t')
+      OUT.write(FILETYPE)
+      OUT.write(b'\t')
+      OUT.write(FILENO)
+      OUT.write(b'\t')
+      OUT.write(mm[1])
+    OUT.write(b'\n')
+
+(CCdate, segment, filetype, fileno) = sys.argv[1:]
+fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+  CCdate, segment, filetype, fileno)
+
+SEGMENT=codecs.encode(segment,'ascii')
+FILETYPE=codecs.encode(filetype,'ascii')
+FILENO=codecs.encode(fileno,'ascii')
+
+warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/merge_date.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,144 @@
+#!/usr/bin/python3
+'''Add timestamps from Last-Modified-dated (ks.tsv) files into
+   that year's index
+
+Usage: merge_date.py ksvstream cdx-dir outdir
+
+ksvstream consists of tab-separated key, CC date, url and Unix timestamp
+''' # '
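+
+# A ksvstream line might look like this (hypothetical values, tab-separated):
+#   com,example)/	20230928084601	https://example.com/	1695887161.0
+# Index lines matching on key, date and url get a "lastmod" property
+#  added from the timestamp; all other lines are copied through unchanged.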
+
+import sys, io, os, os.path, time, re
+from isal import igzip
+
+
+DEBUG = 0
+while sys.argv[1] == '-d':
+  sys.argv.pop(1)
+  DEBUG += 1  
+
+XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
+NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
+
+RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
+b'(crawldiagnostics|robotstxt)/')
+SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
+                     b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
+                     b'=[^&]*)')
+ISESSION = re.compile(SESSION.pattern,flags=re.I)
+URL=re.compile(b'\{"url": "([^"]*)"')
+WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
+
+# Above based on this from broken Java code:
+# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
+
+#print(sys.argv[3],NPATH,file=sys.stderr)
+
+os.makedirs(sys.argv[3], exist_ok=True)
+
+FN = 0
+
+XCNT = WCNT = 0
+DCNT = 0
+
+XF = igzip.IGzipFile(filename=XPATH%0)
+NF = open(NN:=(NPATH%0),'wb')
+
+def nextLine():
+  '''Move on to next index file if current has run out'''
+  global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT, WCNT
+  while True:
+    xl=XF.readline()
+    XCNT += 1
+    if xl == b'':
+      # need to move to next index file
+      FN += 1
+      XF.close()
+      NF.close()
+      print(NN, flush=True) # so we can compress it
+      print(NN, XCNT, WCNT, DCNT,sep='\t',file=sys.stderr,flush=True)
+      time.sleep(0.1) # so they flush?
+      XN=XPATH%FN
+      if not os.path.exists(XN):
+        return None
+      XF = igzip.IGzipFile(filename=XN)
+      NF = open((NN:=NPATH%FN), 'wb')
+      xl = XF.readline()
+      WCNT = XCNT = 1
+    if WARC.search(xl):
+      WCNT += 1
+      return xl
+    else:
+      NF.write(xl)
+      if DEBUG:
+        sys.stderr.write("out_rc\n")
+
+
+def nextDate(df,dn):
+  global DEBUG, DCNT, XCNT
+  dl = df.readline()
+  if dl == b'':
+    # date stream exhausted: return an empty (falsy) key so the rest of
+    #  the index is just copied through
+    return b"", b"", b"", 0
+  if DEBUG:
+    sys.stderr.write("dl%s: %s\n"%(dn,dl))
+  dkey, ddate, durl, dtime = dl.split(b'\t')
+  DCNT += 1
+  return dkey, ddate, durl, dtime
+
+with open(sys.argv[1], 'rb') as df:
+  DCNT = 0
+
+  dkey, ddate, durl, dtime = nextDate(df,1)
+
+  while (xl := nextLine()) is not None:
+    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
+    m = URL.match(xprops)
+    if m:
+      xurl = m[1]
+    else:
+      raise ValueError("No url in %s"%xprops)
+    if DEBUG:
+      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
+                                            for xp in (xkey, xdate, xurl))))
+    if dkey==xkey and ddate==xdate and durl==xurl:
+      # Got it
+      NF.write(xkey)
+      NF.write(b' ')
+      NF.write(xdate)
+      NF.write(b' ')
+      NF.write(xprops[:-2])
+      NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
+      if DEBUG:
+        sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
+                                             for xp in (xkey, xdate, xurl))))
+        sys.stderr.write(" %d\n"%int(dtime[:-3]))
+
+      dkey, ddate, durl, dtime = nextDate(df,2)
+      continue
+    else:
+      if dkey and xkey.decode('ascii')>dkey.decode('ascii'):
+        # we've missed something, disaster looms
+        print("Fail2:\n"
+               "      xkey: %s\n"
+               "      dkey: %s\n"
+               "      xdate: %s\n"
+               "      ddate: %s\n"
+               "      xurl: %s\n"
+               "      durl: %s\n"
+               "FN: %s XCNT: %s DCNT: %s\n"
+               "xl: %s"%(xkey, dkey, xdate, ddate,
+                         xurl, durl,
+                         FN, XCNT, DCNT, xl),
+              file=sys.stderr)
+        # try to force recovery
+        dkey, ddate, durl, dtime = nextDate(df,3)
+        continue
+      # else fall through to write
+    NF.write(xl)
+    if DEBUG:
+      sys.stderr.write("out_nl\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/per_segment.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,36 @@
+#!/usr/bin/python3
+'''refactor a per-cdx count table to be per-segment
+input on STDIN
+Usage: per_segment segment-column
+Assumes column 0 is empty, count is in column 1
+Segment column is 0-origin
+'''
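+
+# Worked example (hypothetical line, segment in column 2):
+#  '\t250\t17\ttext/html\n' with segment-column 2 pops s=17 and n=250,
+#  accumulates ss[17]['text/html\n'] += 250, and s17.tsv eventually
+#  gets the line '250\ttext/html\n'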
+
+import sys
+
+c=int(sys.argv[1])
+
+ss=[dict() for i in range(100)]
+
+for l in sys.stdin:
+  try:
+    cc=l.split('\t')
+    s=int(cc.pop(c))
+    n=int(cc.pop(1))
+    ll='\t'.join(cc[1:]) # note we ditch the initial empty column
+    #print(s,n,cc,ll,sep='|')
+    #exit(0)
+    t=ss[s].get(ll,0)
+    ss[s][ll]=t+n
+  except Exception:
+    sys.stdout.write(l)
+    print(cc)
+    exit(1)
+
+# note this won't work if c is the last column (its trailing newline
+#  would be popped along with it, leaving unterminated output lines)!
+for s in range(100):
+  with open('s%s.tsv'%s,'w') as f:
+    for (l,c) in sorted(ss[s].items(),key=lambda p:p[1],reverse=True):
+      f.write(str(c))
+      f.write('\t')
+      f.write(l)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/percent_encode.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,9 @@
+'''Handle unquoting of non-UTF-8 bytes by %-encoding them'''
+import codecs
+
+def percent_encode(ude):
+  #print(ude.object,ude.object[ude.start:ude.end])
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+          ude.end)
+
+codecs.register_error('percent',percent_encode)
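+
+# e.g. b'a\x96b'.decode('utf-8', errors='percent') == 'a%96b'
+#  (0x96 is not valid UTF-8, so the offending byte is %-encoded
+#   instead of raising UnicodeDecodeError)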
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/sort_date.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,142 @@
+#!/usr/bin/python3
+'''Process output of lmh_warc [new 3-column version]
+   Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
+'''
+
+# Assumes you have used fgrep $'\t' on the input for speed (as above)
+# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
+#  to fix a common 'bad' timestamp (~ .2% of inputs)
+
+import email.utils
+import sys
+from urllib.parse import urlsplit, quote, unquote
+import surt
+
+import re, codecs
+from itertools import chain
+
+WPAT = re.compile('(,www\\d*)+\\)')
+
+# Thanks to https://stackoverflow.com/a/8776871
+import locale
+from functools import cmp_to_key
+
+def percent_encode(ude):
+  #print(ude.object,ude.object[ude.start:ude.end])
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+          ude.end)
+
+codecs.register_error('percent',percent_encode)
+
+def _u_esc(c):
+  if c<65536:
+    return '\\u%04X'%c
+  else:
+    return '\\U%08X'%c
+
+def java_unicode_encode(ude):
+  r'''like backslashreplace but use uppercase \uNNNN (or \UNNNNNNNN) instead of \xnn'''
+  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
+          ude.end)
+
+codecs.register_error('java_unicode',java_unicode_encode)
+
+# From RFC-3986:
+# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+#                / "*" / "+" / "," / ";" / "="
+# But # _is_ escaped in Java surt results
+#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
+
+# Note also that although quote already does _not_ quote - . / _ ~
+#  they are included below as that's what we find in surt.surt 0.3.1
+
+# Also, Java surt strips _all_ leading 'www\d*.',
+#  where python3 surt only strips the first one.
+
+# And Java strips so-called option session-ids, but python doesn't
+
+import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
+import surt.URLRegexTransformer
+
+ident = ''.join(chr(i) for i in range(256)).encode('latin-1')
+
+IDMAP=bytes.maketrans(ident,ident)
+
+# For removal of non-printing characters:
+#  Note, this is only a guess: the only example seen so far is DEL
+NONPRINT= ''.join(chr(i) for i in chain(range(9),
+                                      range(14,32),
+                                      [127] # DEL
+                                      )).encode('latin-1')
+
+def notDefaultCanon(hu,**options):
+  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
+    # Try to fix the incompatibility between Java and Python surt
+    #  handling of 'octal' numbers in numeric IPv4 addresses
+    #  (which it should!)  See "After this line:
+    #
+    # 15,225,107,143)" in .../azure/notes.txt
+    try:
+      bytestrs = hu.host.split(b'.')
+      hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
+    except ValueError:
+      pass
+  if hu.query:
+    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)
+  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
+
+# Hack this to reproduce the Java bug
+surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
+    re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
+    ]
+
+# Above based on this from broken Java code:
+# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
+
+def cdx_key(uristring):
+  _surt = quote(unquote(surt.surt(unquote(uristring),
+                                  canonicalizer=notDefaultCanon),
+                        errors='percent'),
+                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # '
+                  ).lower()
+                # Wrt \x7f (DEL), see discussion in notes wrt
+                #   "biz,televida)" case
+                # It remains to be seen whether other non-printing bytes
+                #  will need to be treated as 'safe'
+  return WPAT.sub(')',_surt)
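+
+# e.g. cdx_key('https://www.example.com/a?b=1') -> 'com,example)/a?b=1'
+#  (hedged: the exact output depends on the surt package version)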
+
+def keyed(l):
+  uri, cc_stamp, dateTime = l.split('\t',2)
+  #print('ul',uri,file=sys.stderr)
+  try:
+    try:
+      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
+    except OverflowError:
+      epoch = 32535215999.0
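+      # i.e. 3000-12-31T23:59:59Z, a sentinel for unrepresentable dates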
+    return ((cdx_key(uri), cc_stamp, uri), epoch)
+  except (TypeError,IndexError,ValueError) as e:
+    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
+    return
+
+fstr = sys.argv[1]
+
+with open(fstr,"r") as ff:
+  # crucial that the following is done _after_ the file is opened
+  #  with the default (utf-8) locale!
+  locale.setlocale(locale.LC_ALL, "C")
+  ctk=cmp_to_key(locale.strcoll)
+  for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
+                   key=lambda x:x[0]):
+    print(key[0],key[1],
+          key[2].encode('ascii',errors='java_unicode').decode('ascii'),
+          ts,sep='\t')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/spearman.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment 
+   First column is for whole crawl, then 100 columns for segs 0-99
+   Each row is counts for some property, e.g. mime-detected or tld
+
+   For example, assuming all.tsv has the whole-crawl warc-only counts
+   and s...tsv have the segment counts, all with counts in column 1,
+
+   tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+
+   will produce such a file with
+     * 100 rows, one for each of the top 100 counts
+     * 101 columns, 0 for all and 1--100 for segs 0--99
+
+   Usage: python3 -i spearman.py name id
+     where name.csv has the input
+'''
+     
+import numpy as np
+from numpy import loadtxt
+from scipy import stats
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+import pylab
+
+import sys, math
+
+def qqa():
+  # q-q plot for the whole crawl
+  sm.qqplot(all, line='s')
+  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
+  plt.show()
+
+def qqs():
+  # q-q plots for the best and worst (by variance) segments
+  global xv, xworst, xbest
+  xv=[d.variance for d in xd]
+  xworst=xv.index(max(xv))
+  xbest=xv.index(min(xv))
+  print(xbest,xworst)
+  sm.qqplot(x[xbest], line='s')
+  plt.gca().set_title('Best segment (least variance): %s'%xbest)
+  plt.show()
+  sm.qqplot(x[xworst], line='s')
+  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+  plt.show()
+
+def plot_x(sort=False,block=True,all_only=True,title=None):
+  # Make these two subplots, w. and w/o sorting
+  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+  #  for legend hacking
+  if sort:
+    aso=np.argsort(-all)
+    plot_all=all[aso]
+    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
+  else:
+    plot_all=all
+    plot_x=[xd[i].mean for i in range(N)]
+  if title is None:
+    l1='Rank correlation of segment x whole crawl'
+    l2='Mean of segment x whole crawl'
+  else:
+    l1=l2=None
+  plt.plot(plot_all,'rx',label=l1)
+  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
+  if not(all_only):
+    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
+  if title is None:
+    # legend must follow the plot calls, or the labels aren't picked up
+    plt.legend(loc='best',fontsize='small')
+  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
+  plt.grid(True)
+  if title is not None:
+    plt.title(title)
+  plt.show(block=block)
+
+def hist_x(align='mid'):
+  hist(xm,xsd,[xd[i].mean for i in range(N)],
+       'Mean of rank correlation of each segment x all other segments',
+       align)
+
+def hist_all(align='mid'):
+  hist(all_m,np.sqrt(all_s.variance),all,
+       'Rank correlation of each segment x whole crawl %s'%id,
+       align)
+
+def hist(m,sd,hh,title,align):
+  sdd=[(i,m-(i*sd)) for i in range(-2,3)]
+  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+  sdax=hax.twiny()
+  hax.hist(hh,color='lightblue',align=align)
+  hax.set_title(title)
+  for s,v in sdd:
+       sdax.plot([v,v],[0,18],'b')
+  sdax.set_xlim(hax.get_xlim())
+  sdax.set_ylim(hax.get_ylim())
+  sdax.set_xticks([v for s,v in sdd])
+  sdax.set_xticklabels([str(s) for s,v in sdd])
+  plt.show()
+
+def ci(rho,n,conf=0.95):
+  # Courtesy of https://stats.stackexchange.com/a/18904
+  # rho is (rank) correlation, n is sample size
+  stderr=1.0/math.sqrt(n-3)
+  z=stats.norm.ppf(1.0-((1.0-conf)/2))
+  delta=z*stderr
+  lower=math.tanh(math.atanh(rho)-delta)
+  upper=math.tanh(math.atanh(rho)+delta)
+  return (lower,upper)
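+
+# e.g. ci(0.9,100) ~= (0.855, 0.932): the Fisher z-transform gives
+#  symmetric bounds in z-space which map back asymmetrically to rho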
+
+def plot_ci(rhos,n,trim=None,conf=0.95):
+   # rhos are (rank) correlation values
+   rhos_s=rhos[(-rhos).argsort()]
+   if trim is None:
+     l=len(rhos)
+   else:
+     rhos_s=rhos_s[:trim]
+     l=trim
+   cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
+   ue=cc[1]-rhos_s
+   le=rhos_s-cc[0]
+   #for i in range(len(rhos)):
+     #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+   plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+   plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
+   plt.show()
+
+def first_diff(ranks):
+  # first disagreement with baseline == {1,2,...}
+  for i in range(len(ranks)):
+    if ranks[i]!=i+1.0:
+      return i
+  return i+1
+
+def ranks():
+  # Combine segment measures:
+  #  segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
+  # convert to ranks, smallest value == highest rank
+  all_ranked=stats.rankdata(-all,method='average') # invert since
+                                                   #  large corr is good
+  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
+                                                  # small corr variance is good
+  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
+                                                   # invert since
+                                                   #  large mean corr is good
+  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
+                                                   # invert since
+                                                   #  large first diff is good
+  return np.array([[i,
+                    all_ranked[i],
+                    x_variance_ranked[i],
+                    x_mean_ranked[i],
+                    fd_ranked[i]] for i in range(N)])
+
+def main():
+  global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
+  global aa, aa_by_all, N
+  counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+  id=sys.argv[2]
+  N=counts.shape[1]-1
+  # "If axis=0 (default), then each column represents a variable, with
+  #        observations in the rows"
+  # So each column is a sequence of counts, for whole crawl in column 0
+  #   and for segments 0--N-1 in columns 1--N
+  corr=stats.spearmanr(counts,nan_policy='omit').correlation
+
+  all=corr[0][1:]
+  all_s=stats.describe(all)
+  all_m=all_s.mean
+
+  x=np.array([np.concatenate((corr[i][1:i],
+                              corr[i][i+1:])) for i in range(1,N+1)])
+  # The above, although transposed, works because the correlation matrix
+  #  is symmetric
+  xd=[stats.describe(x[i]) for i in range(N)]
+  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
+  xm=xs.mean
+  xsd=np.sqrt(xs.variance)
+
+  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
+
+  aa=ranks()
+  aa_by_all=aa[aa[:,1].argsort()]
+
+main()
+
+### I need to review rows, e.g. counts[0] is an array of N+1 counts
+###   for the most common label in the complete crawl,
+###   from the complete crawl and all the segments
+### versus columns, e.g. counts[:,0] is an array of N decreasing counts
+###   for all the labels in the complete crawl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/test_warc.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,33 @@
+import warc,sys
+
+OUT=open(sys.stdout.fileno(),'wb')
+
+if (debug:=(sys.argv[1]=='-d')):
+  sys.argv.pop(1)
+
+tt=int(sys.argv.pop(1))
+
+def showme(wtype,buf,part):
+  # This should exactly reproduce a complete warc file if called
+  #  as per version 1 below
+  if debug:
+    OUT.write(b"----start %d-----\n"%part)
+  OUT.write(buf)
+  if buf[-1]!=10:
+    OUT.write(b'\r\n')
+  if part==7:
+    OUT.write(b'\r\n') # to match complete file formatting
+  if debug:
+    OUT.write(b"----end %d-----\n"%part)
+  return OUT
+
+if tt==1:
+  warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],parts=int(sys.argv[2]),debug=debug)
+elif tt==2:
+  warc.warc(sys.argv[1],showme,[b'warcinfo'],parts=int(sys.argv[2]),debug=debug)
+elif tt==3:
+  warc.warc(sys.argv[1],showme,[b'warcinfo'],whole=True,debug=debug)
+elif tt==4:
+  warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],whole=True,debug=debug)
+elif tt==5:
+  warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/warc.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+'''Stream a warc format file, unzipping if necessary, invoking a
+callback on each record.  Callback can be limited by WARC-Type, record
+part'''
+
+import sys,io
+from isal import igzip
+
+RESP = b'response'
+REQ = b'request'
+META = b'metadata'
+INFO = b'warcinfo'
+
+def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
+  '''parts is a bit-mask:
+     1 for warc header;
+     2 for req/resp HTTP header, warcinfo/metadata features;
+     4 for req/resp body'''
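+  # e.g. warc('f.warc.gz',cb,[b'response'],parts=3) calls cb twice per
+  #  response record, once with the WARC header (part 1) and once with
+  #  the HTTP header (part 2); see test_warc.py for live examples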
+  # should do some sanity checking wrt parts and types
+  types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
+  nb=0
+  if filename.endswith(".gz"):
+    stream=igzip.IGzipFile(filename=filename)
+  else:
+    stream=open(filename,'rb',0)
+  bufSize=2*1024*1024
+  hdrMax=16*1024
+  buf=bytearray(bufSize)
+  bufView=memoryview(buf)
+  fpos=bl=stream.readinto(buf)
+  bp=0
+  done=False
+  while True:
+    while buf.startswith(b'\r\n',bp,bl): # will fail if buffer (nearly) empty
+      bp+=2
+    start_1=bp
+    if not buf.startswith(b'WARC/1.0\r\n',bp):
+      if done and bl-bp==0:
+        # really done
+        return
+      raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename,
+                                                                   bp,bl,fpos,
+         (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
+                                                                     bl-bp))
+    bp+=10
+    wtype=None
+    length=None
+    tr=None # Was this record truncated?
+    while not buf.startswith(b'\r\n',bp):
+      # there should always be enough in the buffer to complete this loop,
+      #  because of the buffer update logic below
+      eol=buf.index(b'\r\n',bp)+2
+      if buf.startswith(b"Content-Length: ",bp):
+        length=int(bufView[bp+16:eol-2])
+      elif buf.startswith(b"WARC-Truncated: ",bp):
+        if bp+16==eol-2:
+          tr=b"EMPTY"
+        else:
+          tr=bytes(bufView[bp+16:eol-2])
+      elif buf.startswith(b'WARC-Type: ',bp):
+        if buf.startswith(b's',bp+13):
+          wtype = RESP
+        elif buf.startswith(b'q',bp+13):
+          wtype = REQ
+        elif buf.startswith(b'm',bp+11):
+          wtype = META
+        elif buf.startswith(b'w',bp+11):
+          wtype = INFO
+        else:
+          raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
+                             bytes(bufView[bp+11:eol-2]),filename,
+                             fpos-(bl-bp)))
+      bp=eol
+    bp=eol+2
+    if done:
+      if (bp+length)>bl:
+        raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
+                         length,bl,filename))
+    elif (bp+(length+hdrMax))>bl:
+      # Need more data
+      if wtype in types:
+        # we need to keep from start_1 to bl
+        keepFrom=start_1
+        keepLen=bl-keepFrom
+        buf[0:keepLen]=bufView[keepFrom:bl]
+        eol=eol-start_1
+        start_1=0
+        bp=eol+2
+      else:
+        # we can skip the rest of this part
+        if (bp+length)<=bl:
+          # we have at least some bytes from the next part
+          keepLen=bl-(bp+length)
+          buf[0:keepLen]=bufView[bl-keepLen:bl]
+        else:
+          # we don't have all of the bytes from the current part
+          #  so can skip the rest of it
+          keepLen=0
+          fpos=stream.seek(fpos+(bp+length-bl))
+        bp=0
+      spaceToFill=bufSize-keepLen
+      with memoryview(buf)[keepLen:bufSize] as xBuf:
+        nb=stream.readinto(xBuf)
+      fpos+=nb
+      bl=keepLen+nb
+      if nb<spaceToFill:
+        done=True
+      if wtype not in types:
+        continue
+    if (wtype in types):
+      # Output whole or part 1 as required
+      if whole:
+        bp+=length
+        OUT=callback(wtype,bufView[start_1:bp],7)
+        continue
+      elif (parts & 1):
+        OUT=callback(wtype,bufView[start_1:eol],1)
+      if parts!=1:
+        while buf.startswith(b'\r\n',bp):
+          bp+=2
+        start_2=bp
+        eob=bp+length
+        while buf.startswith(b'\r\n',eob-2):
+          eob-=2
+        # Only output parts (2 = HTTP header, 4 = body) that are wanted
+        if parts & 2:
+          if wtype is META or wtype is INFO:
+            # rest of the part
+            OUT=callback(wtype,bufView[start_2:eob],2)
+          else:
+            # request and response have http headers
+            eo2=buf.index(b'\r\n\r\n',start_2)
+            OUT=callback(wtype,bufView[start_2:eo2+2],2)
+        if parts & 4:
+          for L in rec_text:
+            if state==2:
+              # HTTP header
+              wl -= len(L)
+              if not (L==b"" or L.startswith(b"\r")):
+                # Non-empty, it's (a continuation of) a header
+                if bl is None and L.startswith(b"Content-Length: "):
+                  bl=int(L[16:].rstrip())
+              else:
+                # Blank line, HTTP header is finished
+                if parts & 2:
+                  callback(wtype,bufView[start_2:start_2+L_start],2)
+                state=4
+                # The above is just for sanity, because we do _not_
+                #  continue with the outer loop,
+                #  since we can now block-output the entire rest of the
+                #  input buffer.
+                if bl is not None:
+                  if bl!=wl:
+                    print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
+                          (length,offset,filename,wl,bl,tr),file=sys.stderr)
+                # HTTP body
+                balance=start_2+rec_text.tell()
+                #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
+                # Output whatever is left
+                if parts & 4:
+                  callback(wtype,bufView[balance:balance+wl],4)
+                state=1
+
+              L_start=rec_text.tell()
+    bp+=length
+    #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/deltas.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,33 @@
+#!/usr/bin/python3
+'''Extract and tabulate runtimes per file from a slurm output log'''
+import sys, re
+from datetime import datetime
+pending={}
+first=None
+SPAT=re.compile('... (.*) BST start ([0-9]+ [0-9]+)')
+EPAT=re.compile('... (.*) BST end ([0-9]+ [0-9]+)')
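+# Matches pairs of slurm log lines like (hypothetical):
+#   Mon 21 Aug 2023 01:02:03 PM BST start 123456 789
+#   Mon 21 Aug 2023 01:04:05 PM BST end 123456 789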
+with open(sys.argv[1],'r') as f:
+  for l in f:
+    if m:=SPAT.match(l):
+      b=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p")
+      id=m[2]
+      if id in pending:
+        print('%s started twice at %s, %s'%(id,pending[id],b),file=sys.stderr)
+      else:
+        pending[id]=b
+        if first is None:
+          first=b
+    elif m:=EPAT.match(l):
+      e=datetime.strptime(m[1],"%d %b %Y %I:%M:%S %p")
+      id=m[2]
+      if id in pending:
+        delta=(e-pending[id]).seconds
+        print(delta,"%2d:%02d"%(delta/60,delta%60),sep='\t')
+        del pending[id]
+      else:
+        print('%s ended w/o start at %s'%(id,e),file=sys.stderr)
+w=(e-first).seconds
+sys.stdout.flush()
+print('From %s to %s:'%(first.strftime("%d %b %Y %I:%M:%S %p"),
+                        e.strftime("%d %b %Y %I:%M:%S %p")),file=sys.stderr)
+print(' %d:%02d:%02d'%(w//3600,(w//60)%60,w%60),(e-first).seconds,sep='\t',file=sys.stderr)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/plinks.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+import sys,pdfx,traceback,os
+from datetime import datetime
+
+def run(file):
+  try:
+    pdf=pdfx.PDFx(file)
+    links=pdf.get_references_as_dict()
+    if bool(links) and (links.get('scrape',False) or
+                        links.get('annot',False)):
+      for k in links.keys():
+        for l in links[k]:
+          print("%s\t%s"%(k,l))
+    else:
+      print("None")
+  except Exception as e:
+    if str(e)=='Unexpected EOF':
+      print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr)
+      print("badpdf")
+    else:
+      print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr)
+      traceback.print_exc(file=sys.stderr)
+
+if sys.argv[1]=='-':
+  i=0
+  for l in sys.stdin:
+    print(i,file=sys.stderr)
+    i+=1
+    f=l.rstrip()
+    if os.path.getsize(f)==1048576: # truncated
+      print("truncated",file=sys.stderr)
+      print("truncated")
+    else:
+      run(f)
+    os.unlink(f)
+else:
+  run(sys.argv[1])