view bin/merge_date.py @ 94:009e633eb804

last version before giving up on approach based only on key and datestamp
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 07 Sep 2023 18:03:55 +0100
parents 25bd398a8035
children 18446a7eeb9e
line wrap: on
line source

#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py [-d] ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp
''' # '

import sys, io, os, os.path, time, re
from isal import igzip

# An optional leading -d turns on debugging; it is removed from sys.argv
# so that the positional arguments keep the positions given in the usage
DEBUG = (sys.argv[1] == '-d')
if DEBUG:
  sys.argv.pop(1)

# Name templates for the numbered input (gzipped) and output index files:
# the directory is filled in here, leaving a %0.3d for the file number
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

# Matches the start of the "filename" property of an index entry whose
# file lives in a crawldiagnostics or robotstxt directory
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Matches a key whose query part starts with a session-id parameter:
#  group 1 is everything up to and including the '?'
#  group 2 is the whole session-id parameter (name=value, or the
#          cfid=...&cftoken=... pair)
# Raw bytes literals are used so that \? reaches the regex compiler
# without being an invalid string escape (a SyntaxWarning from 3.12)
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')

# Above based on this from fixed Java code:
#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

# The output directory must exist before the first output file is opened
os.makedirs(sys.argv[3], exist_ok=True)

# FN is the number of the index file currently being processed;
# XCNT and DCNT count lines read and appear in the debugging output
FN = XCNT = DCNT = 0

# Begin with file number 0 for both the input index and its rewritten copy
XF = igzip.IGzipFile(filename=XPATH%0)
NN = NPATH%0
NF = open(NN, 'wb')

def nextLine():
  '''Return the next line of the current index file, moving on to the
     next index file whenever the current one has run out.

     On moving on, the finished input and output files are closed, the
     name of the finished output file is printed on stdout, and the
     next numbered input/output pair is opened.  Index files with no
     lines at all are skipped rather than being passed back as b''.

     Returns the line (bytes), or None if there is no next index file.'''
  global FN, NF, NN, XF, XCNT, DCNT
  while True:
    xl = XF.readline()
    if xl != b'':
      XCNT += 1
      return xl
    # need to move to next index file
    FN += 1
    DCNT = 0 # this is relative to FN
    XCNT = 0 # likewise, counts lines of the current index file
    XF.close()
    NF.close()
    print(NN, flush=True) # so we can compress it
    time.sleep(0.1) # so it flushes?
    XN = XPATH%FN
    if not os.path.exists(XN):
      return
    XF = igzip.IGzipFile(filename=XN)
    NN = NPATH%FN
    NF = open(NN, 'wb')
    # loop round to read the first line of the new file

def keys(key):
  '''Work around the 2019-35-vintage Java fixup, which did not detect
     session ids at the very start of the parameter part of a key.

     Returns a triple (messy, key1, key2):
      - no query-initial session id: (False, key, None)
      - the session id is all there is after the '?':
          (True, the part before the '?', None)
      - otherwise: (True, the part up to and including the '?',
                    the key with the session-id parameter cut out)'''
  found = SESSION.match(key)
  if not found:
    return False, key, None
  head = found[1]
  start, end = found.span(2)
  stripped = key[:start] + key[end:]
  if stripped == head:
    # nothing but the session id followed the '?', so drop the '?' too
    return True, head[:-1], None
  return True, head, stripped

dfq = [] # for reordering if needed

def _dfields(dl):
  '''Split a ksvstream line into its tab-separated key, CC date and
     timestamp.  An empty line, i.e. end of stream, gives three empty
     bytes fields instead of failing to unpack'''
  return dl.split(b'\t') if dl else (b'', b'', b'')

# Merge loop: walk the index lines and the dated (ksvstream) lines in
# step.  An index line that matches the current dated line on key and
# date is written with a "lastmod" property added and the dated stream
# advances; every other index line is copied unchanged.
with open(sys.argv[1], 'rb') as df:
  dl = df.readline()
  DCNT = 1
  # An empty ksvstream is treated as already at end of stream
  dkey, ddate, dtime = _dfields(dl)

  while (xl:=nextLine()) is not None:
    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
    messy, xkey1, xkey2 = keys(xkey)
    if messy:
      # The index key starts its parameters with a session id, so the
      # dated lines sharing its prefix may be in a different order: set
      # non-matching ones aside in dfq, retrying those set aside last
      # time (stale) before reading any further
      stale=dfq
      dfq=[]
      while (dl and dkey.startswith(xkey1) and
             (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
        dfq.append(dl)
        if stale:
          dl = stale.pop(0)
        else:
          dl = df.readline()
          if dl:
            DCNT += 1
        # At end of stream dl is b'', which ends this loop
        dkey, ddate, dtime = _dfields(dl)
    if (not dl or
        ddate != xdate or
        not dkey.startswith(xkey1) or
        (xkey2 is not None and dkey!=xkey2)):
      # No dated line for this index line (or none left at all)
      if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
        # The index has gone past the current dated line without
        # matching it: report the state and give up
        print("Fail: xkey: %s\n"
              "      dkey: %s\n"
              "      xdate: %s\n"
              "      ddate: %s\n"
              "dfq: %s\n"
              "k1, k2: |%s|%s|\n"
              "FN: %s XCNT: %s DCNT: %s\n"
              "xl: %s"%(xkey, dkey, xdate, ddate,
                        (b'\n     '.join(dfq)).decode('ascii'),
                        xkey1, xkey2, FN, XCNT, DCNT, xl),
              file=sys.stderr)
        raise ValueError
      NF.write(xl)
      continue
    # Match: rewrite the index line with the timestamp added
    NF.write(xkey)
    NF.write(b' ')
    NF.write(xdate)
    NF.write(b' ')
    # Drop the last two bytes of the properties (presumably the closing
    # '}' and the newline -- TODO confirm) so "lastmod" can be appended
    NF.write(xprops[:-2])
    # dtime loses its last three bytes before conversion (presumably a
    # trailing '.0' and the newline -- TODO confirm)
    NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
    dl = df.readline()
    if dl == b'':
      # df is opened in binary mode, so end of stream is b'', not ''
      if dfq:
        if DEBUG:
          breakpoint()
      # write out the last of the last index file, if any
      dkey = ddate = b''
    else:
      DCNT += 1
      dkey, ddate, dtime = dl.split(b'\t')