view bin/merge_date.py @ 115:0b1e6e134aca

robotstxt and crawldiagnostics get free ride, get rid of DFQ and xq, big simplification and refactor as a result, fix bug in date stream eof handling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 26 Sep 2023 17:42:57 +0100
parents 4a52585a1aac
children f52783faf3ee
line wrap: on
line source

#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date, url and Unix timestamp
''' # '

import sys, io, os, os.path, time, re
from isal import igzip


# Debug level: every leading -d on the command line raises it by one.
# The flags are removed from sys.argv so the positional arguments
# (ksvstream, cdx-dir, outdir) are always found at indices 1, 2 and 3.
DEBUG = 0
while sys.argv[1] == '-d':
  del sys.argv[1]
  DEBUG += 1

# %-templates, parameterised by index file number, for the compressed
# input index files (in cdx-dir) and the uncompressed output files (in outdir)
XPATH = sys.argv[2] + "/cdx-00%0.3d.gz"
NPATH = sys.argv[3] + "/cdx-00%0.3d"

# All patterns are raw bytes literals: sequences such as \? and \{ are not
# valid escapes in an ordinary bytes literal and draw a warning from the
# compiler, whereas in a raw literal they reach the re module unchanged.

# Index line whose "filename" property points into a crawldiagnostics or
# robotstxt directory of a crawl segment
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Session-id parameter directly after the '?' of a URL: group 1 is everything
# up to and including the '?', group 2 is the session parameter and its value
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# Same pattern as SESSION, matched case-insensitively
ISESSION = re.compile(SESSION.pattern,flags=re.I)
# The "url" property at the very start of an index line's JSON part; group 1
# is the URL itself
URL=re.compile(rb'\{"url": "([^"]*)"')
# Index line whose "filename" property has warc/ as its fifth path component
WARC=re.compile(rb' \{[^}]*"filename": "([^/]*/){4}warc/')

# Above based on this from broken Java code:
# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

# The output directory must exist before the first output file is opened
os.makedirs(sys.argv[3], exist_ok=True)

# FN:   number of the index file currently being processed
# XCNT: lines read so far from the current index file
# DCNT: lines read so far from the date stream
FN = XCNT = DCNT = 0

# XF: current (gzipped) input index file
# NN / NF: name of, and handle on, the corresponding output file
XF = igzip.IGzipFile(filename=XPATH % FN)
NN = NPATH % FN
NF = open(NN, 'wb')

def nextLine():
  '''Return the next index line that refers to a warc/ file.

  Lines that do not match WARC are copied unchanged to the current output
  file on the way.  When the current index file is exhausted, both it and
  its output file are closed, the name of the finished output file is
  printed on stdout, and processing moves on to the next numbered index
  file.  Returns None once there is no next index file.
  '''
  global FN, NF, NN, XF, XCNT
  while True:
    line = XF.readline()
    XCNT += 1
    if line == b'':
      # Current index file is used up: finish it off and try the next one
      FN += 1
      XF.close()
      NF.close()
      print(NN, flush=True) # so we can compress it
      time.sleep(0.1) # so it flushes?
      next_index = XPATH % FN
      if not os.path.exists(next_index):
        return None
      XF = igzip.IGzipFile(filename=next_index)
      NN = NPATH % FN
      NF = open(NN, 'wb')
      line = XF.readline()
      XCNT = 1
    if WARC.search(line):
      return line
    # Not a warc/ entry: it gets a free ride straight to the output
    NF.write(line)
    if DEBUG:
      sys.stderr.write("out_rc\n")


def nextDate(df,dn):
  '''Read one line from the date stream and split it into its fields.

  df is the date stream, read as bytes; dn is only used to label the
  debugging output with the call site.

  Returns the four tab-separated fields (key, date, url, timestamp) as
  bytes, the last one still carrying whatever ended the line.  At end of
  stream returns ("", "", "", 0), whose empty key tests false.

  Raises ValueError if the line does not have exactly four fields.
  '''
  global DCNT
  line = df.readline()
  if line == b'':
    # date stream exhausted
    return "", "", "", 0
  if DEBUG:
    sys.stderr.write("dl%s: %s\n"%(dn,line))
  key, date, url, stamp = line.split(b'\t')
  DCNT += 1
  return key, date, url, stamp

# Main merge: walk the index lines and the date stream in parallel.  An
# index line whose key, date and url all equal those of the current date
# entry is written out with a "lastmod" property added; every other index
# line is written out unchanged.
with open(sys.argv[1], 'rb') as df:
  DCNT = 0

  dkey, ddate, durl, dtime = nextDate(df,1)

  # nextLine() returns None itself (not a sequence) once the last index
  # file has been finished, so the result must be tested directly
  while (xl := nextLine()) is not None:
    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
    m = URL.match(xprops)
    if m is None:
      raise ValueError("No url in %s"%xprops)
    xurl = m[1]
    if DEBUG:
      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                            for xp in (xkey, xdate, xurl))))

    # The index has moved past the key of the current date entry, so that
    # entry can no longer match anything: report it and discard it.  The
    # current index line is kept, so that it is still compared with the
    # following date entry and, matched or not, still reaches the output.
    # dkey is empty (false) once the date stream is exhausted.  Keys are
    # compared as bytes, which orders ASCII keys exactly as their decoded
    # strings would be ordered.
    while dkey and xkey > dkey:
      # we've missed something, disaster looms
      print("Fail2:"
             "      xkey: %s\n"
             "      dkey: %s\n"
             "      xdate: %s\n"
             "      ddate: %s\n"
             "      xurl: %s\n"
             "      durl: %s\n"
             "FN: %s XCNT: %s DCNT: %s\n"
             "xl: %s"%(xkey, dkey, xdate, ddate,
                       xurl, durl,
                       FN, XCNT, DCNT, xl),
            file=sys.stderr)
      # try to force recovery
      dkey, ddate, durl, dtime = nextDate(df,3)

    if dkey==xkey and ddate==xdate and durl==xurl:
      # Got it
      # The last three bytes of the time field are dropped before conversion
      # (presumably a '.0' fraction plus the newline -- confirm against the
      # ks.tsv format).  Convert before writing anything so that a bad
      # timestamp cannot leave a half-written line in the output.
      lastmod = int(dtime[:-3])
      # xprops[:-2] drops the final two bytes of the line (its closing
      # brace and newline) so that the new property can be added inside
      NF.write(b'%s %s %s, "lastmod": "%d"}\n'%(xkey, xdate,
                                                 xprops[:-2], lastmod))
      if DEBUG:
        sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
                                             for xp in (xkey, xdate, xurl))))
        sys.stderr.write(" %d\n"%lastmod)

      dkey, ddate, durl, dtime = nextDate(df,2)
    else:
      # no date for this line: copy it through as it is
      NF.write(xl)
      if DEBUG:
        sys.stderr.write("out_nl\n")