view bin/merge_date.py @ 110:a0ea1e4a714d

pass in debug flag(s) to merge_date.py
author Henry Thompson <ht@markup.co.uk>
date Tue, 19 Sep 2023 19:40:58 +0100
parents 52c6a9b0fc8c
children 4a52585a1aac
line wrap: on
line source

#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py [-d]... ksvstream cdx-dir outdir

Each -d raises the debug level by one.

ksvstream consists of tab-separated key, CC date, URL and Unix timestamp
''' # '

import sys, io, os, os.path, time, re
from isal import igzip


# Debug level: count the leading -d flags and strip them from sys.argv so
# that the positional arguments keep their usual indices.  The length
# check stops the loop cleanly when nothing follows the flags.
DEBUG = 0
while len(sys.argv) > 1 and sys.argv[1] == '-d':
  sys.argv.pop(1)
  DEBUG += 1

# printf-style templates, completed later with a file number:
#  XPATH names a gzipped input index file in the cdx directory,
#  NPATH names the matching uncompressed output file in the output directory
XPATH = sys.argv[2] + "/cdx-00%0.3d.gz"
NPATH = sys.argv[3] + "/cdx-00%0.3d"

# All patterns are raw bytes literals so that regex escapes such as \? and
# \{ reach the re module untouched instead of being (invalid) Python
# string escapes.

# The "filename" property of an index entry whose record lives in a
# crawldiagnostics or robotstxt segment subdirectory
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Group 1: everything up to and including the first '?'
# Group 2: a session-id parameter (name=value) immediately after that '?'
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# Same pattern, ignoring case
ISESSION = re.compile(SESSION.pattern,flags=re.I)
# Group 1: the value of the leading "url" property of an index entry
URL=re.compile(rb'\{"url": "([^"]*)"')

# Above based on this from broken Java code:
# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

# The output directory must exist before the first output file is opened
os.makedirs(sys.argv[3], exist_ok=True)

FN = 0   # number of the index file currently being read/written

XCNT = 0 # lines read so far from the current index file
DCNT = 0 # lines read so far from the date stream

# Open the first pair: gzipped index in, plain bytes out
XF = igzip.IGzipFile(filename=XPATH%0)
NN = NPATH%0
NF = open(NN,'wb')

def nextLine(xq, messyD):
  '''Return (line, xq) for the next index line to process.

  Queued lines in xq are served first, oldest first, but only while
  messyD is false.  Otherwise a line is read from the current index
  file.  When that file is exhausted both current files are closed,
  the finished output name is printed, and the next numbered pair is
  opened.  Returns (None, None) once there is no further index file.'''
  global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
  if xq and not messyD:
    return xq.pop(0), xq
  line = XF.readline()
  XCNT += 1
  if line != b'':
    return line, xq
  # Current index file is finished: move on to the next one
  FN += 1
  XF.close()
  NF.close()
  print(NN, flush=True) # so we can compress it
  time.sleep(0.1) # so it flushes?
  nextIndex = XPATH%FN
  if not os.path.exists(nextIndex):
    return (None, None)
  XF = igzip.IGzipFile(filename=nextIndex)
  NN = NPATH%FN
  NF = open(NN, 'wb')
  line = XF.readline()
  XCNT = 1
  return line, xq

def keys(key):
  '''Work around the 2019-35-vintage Java fixup missing session ids
     that come first in the parameter part of a key.

     Returns (messy, key1, key2).  If SESSION does not match, that is
     (False, key, None).  Otherwise messy is True, and with the session
     parameter cut out of key:
      - if only the part up to the '?' is left, key1 is that part
        without its '?' and key2 is None;
      - else key1 is the part up to and including the '?' and key2
        is the cut-down key.'''
  m = SESSION.match(key)
  if m is None:
    return False, key, None
  prefix = m[1]
  start, end = m.span(2)
  fixed = key[:start] + key[end:]
  if fixed == prefix:
    return True, prefix[:-1], None
  return True, prefix, fixed

dfq = [] # for reordering if needed
messyD = False # truthy while the current date line's URL has a session id

# Merge: walk the index lines in order, holding one pending date line at
#  a time.  An index line that matches the pending date line on date,
#  key and URL is rewritten with a "lastmod" property and the next date
#  line is fetched; any other index line is copied through unchanged.
with open(sys.argv[1], 'rb') as df:
  # Prime the merge with the first date line
  dl = df.readline()
  DCNT = 1
  if DEBUG>1:
    sys.stderr.write("dl1: %s"%dl.decode('ascii'))
  dkey, ddate, durl, dtime = dl.split(b'\t')
  messyD = ISESSION.search(durl)

  xq = [] # index lines set aside while looking for a messy date's match

  while (nlRes := nextLine(xq, messyD))[0] is not None:
    (xl, xq) = nlRes
    # An index line is: key, date, then the JSON properties
    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
    m = URL.match(xprops)
    if m:
      xurl = m[1]
    else:
      raise ValueError("No url in %s"%xprops)
    if DEBUG>1:
      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                            for xp in (xkey, xdate, xurl))))
    messyU, xkey1, xkey2 = keys(xkey)
    if messyD:
      noMatch = (not dkey.startswith(xkey1) or
            (xkey2 is not None and dkey!=xkey2))
      if messyU:
        # better match
        if noMatch:
          raise ValueError("Fail1: md: %s mu: %s\n"
                "      xkey: %s\n"
                "      dkey: %s\n"
                "      xdate: %s\n"
                "      ddate: %s\n"
                "      xurl: %s\n"
                "      durl: %s\n"
                "dfq: %s\n"
                "k1, k2: |%s|%s|\n"
                "FN: %s XCNT: %s DCNT: %s\n"
                "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
                          (b'\n     '.join(dfq)).decode('ascii'),
                          xkey1, xkey2, FN, XCNT, DCNT, xl))
        # fall through to the ordinary (non-messy) match case
      else:
        # still looking, save if >= date else fall through to write
        if DEBUG>1:
          print("Diso: match: %s\n"
                "      xkey: %s\n"
                "      dkey: %s\n"
                "      xdate: %s\n"
                "      ddate: %s\n"
                "      xurl: %s\n"
                "      durl: %s\n"
                "xl: %s"%(not noMatch,
                          xkey, dkey, xdate, ddate, xurl, durl, xl),
                file=sys.stderr)
        if (dkey.startswith(xkey1) and
            (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
          # Shares the key prefix but is not the match: set it aside,
          #  nextLine hands it back once messyD is false again
          xq.append(xl)
          if DEBUG>1:
            sys.stderr.write('xpush\n')
          continue
        # else fall through
    if (ddate != xdate or
            not dkey.startswith(xkey1) or
            (xkey2 is not None and dkey!=xkey2) or
        durl!=xurl):
      # Not the line the pending date belongs to
      if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
        # Debug-only check: the index key has sorted past the pending
        #  date key, so that date line can no longer be matched
        raise ValueError("Fail2: md: %s mu: %s\n"
               "      xkey: %s\n"
               "      dkey: %s\n"
               "      xdate: %s\n"
               "      ddate: %s\n"
               "      xurl: %s\n"
               "      durl: %s\n"
               "dfq: %s\n"
               "k1, k2: |%s|%s|\n"
               "FN: %s XCNT: %s DCNT: %s\n"
               "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
                         xurl, durl,
                         (b'\n     '.join(dfq)).decode('ascii'),
                         xkey1, xkey2, FN, XCNT, DCNT, xl))
      NF.write(xl)
      if DEBUG>1:
        sys.stderr.write("out_nl\n")
      continue
    # Got it
    NF.write(xkey)
    NF.write(b' ')
    NF.write(xdate)
    NF.write(b' ')
    # Drop the last two bytes of the properties (presumably the closing
    #  '}' and the newline) so that the new property can be appended
    NF.write(xprops[:-2])
    # The last three bytes of the time field are discarded (presumably
    #  a two-character suffix plus the newline -- TODO confirm)
    NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
    if DEBUG>1:
      sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
                                           for xp in (xkey, xdate, xurl))))
      sys.stderr.write(" %d\n"%int(dtime[:-3]))
    dl = df.readline()
    if dl == b'':
      # End of the date stream: df is in binary mode, so EOF is b'', not ''
      if dfq:
        if DEBUG:
          raise ValueError
      # write out the last of the last index file, if any
      # The sentinels must be bytes, as they are compared with bytes
      #  index fields and dkey.startswith() is given bytes
      dkey = ddate = durl = b""
      # Nothing is pending any more, so let nextLine drain xq
      messyD = None
    else:
      if DEBUG>1:
        sys.stderr.write("dl3: %s"%dl.decode('ascii'))
      DCNT += 1
      dkey, ddate, durl, dtime = dl.split(b'\t')
      messyD = ISESSION.search(durl)