view bin/merge_date.py @ 91:460f0599e8cd

mostly working, but need to reorder in case of cfid and friends
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 05 Sep 2023 17:32:46 +0100
parents c1a70532444c
children e56a7aad9ce9
line wrap: on
line source

#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp
''' # '

import sys, io, os, os.path, time, re
from isal import igzip

# An optional leading '-d' turns on the ordering check in the merge loop.
# It is removed from argv so the positional arguments keep their indices.
if sys.argv[1] == '-d':
  sys.argv.pop(1)
  DEBUG = True
else:
  DEBUG = False

# Filename templates for the input (gzipped) and output (plain) index files.
# The doubled %% survives the substitution of the directory name here,
# leaving a %0.3d slot to be filled with the file number later.
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

# Matches index lines whose "filename" property points into a
# crawldiagnostics or robotstxt directory of a segment.
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Group 1: everything up to and including the first '?'.
# Group 2: a session-id parameter immediately following that '?'.
# Raw bytes literals are used so that \? reaches the regex compiler as
# written instead of being an invalid string escape.
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')

# Above based on this from fixed Java code:
#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

# The output directory must exist before the first output file is opened
os.makedirs(sys.argv[3], exist_ok=True)

# FN:   number of the index file currently being read/written
# XCNT: lines read so far from the current index file
# dcnt: lines read so far from ksvstream
FN = XCNT = dcnt = 0

# Start with file 0 of both the input and the output index.
# NN always holds the name of the output file currently open as NF.
XF = igzip.IGzipFile(filename=XPATH%0)
NN = NPATH%0
NF = open(NN,'wb')

def nextLine():
  '''Return the next index line that is not a robotstxt or
     crawldiagnostics entry, moving on to the next index file
     whenever the current one has run out.

     Each time an input file is exhausted, it and the current output
     file are closed and the output file's name is printed on stdout.
     The next input/output pair is then opened, if there is one.

     Returns None once there is no further index file.'''
  global FN, NF, NN, XF, XCNT
  while True:
    xl=XF.readline()
    XCNT += 1
    if xl == b'':
      # need to move to next index file
      FN += 1
      XF.close()
      NF.close()
      print(NN, flush=True) # so we can compress it
      time.sleep(0.1) # so it flushes?
      XN=XPATH%FN
      if not os.path.exists(XN):
        return None
      XF = igzip.IGzipFile(filename=XN)
      NF = open((NN:=NPATH%FN), 'wb')
      # Go round again rather than reading here, so that an empty index
      # file is treated as exhausted instead of yielding b'' to the caller
      XCNT = 0
      continue
    if RorDPAT.search(xl):
      #print(xl,file=sys.stderr)
      continue
    return xl

def keys(key):
  '''Work around the failure of the 2019-35-vintage Java fixup to
     detect session ids at the very start of the parameter part.

     Returns a pair (prefix, full):
      - key does not match SESSION: (key, None)
      - the session parameter is all that follows the '?':
        (key up to but not including the '?', None)
      - otherwise: (key up to and including the '?',
                    key with the session parameter cut out)'''
  found = SESSION.match(key)
  if found is None:
    return key, None
  head = found[1]
  # span(2) is (start, end) of the session parameter within key
  cut_from, cut_to = found.span(2)
  remainder = key[:cut_from]+key[cut_to:]
  if remainder != head:
    return head, remainder
  # nothing left after the '?', so drop that too
  return head[:-1], None

# Merge loop: walk the index lines and the ksvstream lines in step.
# An index line that corresponds to the current ksvstream line gets a
# "lastmod" property appended; every other index line is copied unchanged.
with open(sys.argv[1], 'rb') as df:
  dl = df.readline()
  if dl == b'':
    # empty ksvstream: nothing to merge, index lines are just copied
    dkey = ddate = dtime = None
  else:
    dcnt += 1
    dkey, ddate, dtime = dl.split(b'\t')

  while (xl:=nextLine()) is not None:
    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
    xkey1, xkey2 = keys(xkey)
    if (dkey is None or
        ddate != xdate or
        not dkey.startswith(xkey1) or
        (xkey2 is not None and dkey!=xkey2)):
      # No match (or ksvstream is used up): copy the index line as is.
      # With -d, first check that the index has not already gone past
      # the pending ksvstream key.
      if (DEBUG and dkey is not None and
          xkey.decode('ascii')>(dkey.decode('ascii'))):
        print("Fail: xkey: %s\n"
              "      dkey: %s\n"
              "      xdate: %s\n"
              "      ddate: %s\n"
              "k1, k2: |%s|%s|\n"
              "FN: %s\n"
              "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, xl),
              file=sys.stderr)
        raise ValueError()
      NF.write(xl)
      continue
    # Match: rewrite the line with the timestamp added.
    NF.write(xkey)
    NF.write(b' ')
    NF.write(xdate)
    NF.write(b' ')
    # Drop the last two bytes of the properties (expected to be the
    # closing '}' and newline) so the new property can go in before a
    # fresh '}\n'
    NF.write(xprops[:-2])
    # dtime[:-3] drops the last three bytes of the timestamp field;
    # presumably a '.0' fraction plus newline -- TODO confirm against
    # whatever produces ksvstream
    NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
    dl = df.readline()
    # df is opened in binary mode, so end of file is b'', not ''
    if dl == b'':
      # write out the last of the last index file, if any
      dkey = ddate = None
    else:
      dcnt += 1
      dkey, ddate, dtime = dl.split(b'\t')