changeset 115:0b1e6e134aca

robotstxt and crawldiagnostics lines get a free ride (copied through to the output unchanged); get rid of DFQ and xq, giving a big simplification and refactor as a result; fix bug in date-stream EOF handling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 26 Sep 2023 17:42:57 +0100
parents 5818d79c4ec9
children 5b952d16838c
files bin/merge_date.py
diffstat 1 files changed, 43 insertions(+), 106 deletions(-) [+]
line wrap: on
line diff
--- a/bin/merge_date.py	Tue Sep 26 14:18:40 2023 +0100
+++ b/bin/merge_date.py	Tue Sep 26 17:42:57 2023 +0100
@@ -26,6 +26,7 @@
                      b'=[^&]*)')
 ISESSION = re.compile(SESSION.pattern,flags=re.I)
 URL=re.compile(b'\{"url": "([^"]*)"')
+WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
 
 # Above based on this from broken Java code:
 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
@@ -47,11 +48,9 @@
 XF = igzip.IGzipFile(filename=XPATH%0)
 NF = open(NN:=(NPATH%0),'wb')
 
-def nextLine(xq, messyD):
+def nextLine():
   '''Move on to next index file if current has run out'''
   global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
-  if xq and not messyD:
-    return xq.pop(0), xq
   while True:
     xl=XF.readline()
     XCNT += 1
@@ -64,142 +63,80 @@
       time.sleep(0.1) # so it flushes?
       XN=XPATH%FN
       if not os.path.exists(XN):
-        return (None, None)
+        return None
       XF = igzip.IGzipFile(filename=XN)
       NF = open((NN:=NPATH%FN), 'wb')
       xl = XF.readline()
       XCNT = 1
-    return xl, xq
+    if WARC.search(xl):
+      return xl
+    else:
+      NF.write(xl)
+      if DEBUG:
+        sys.stderr.write("out_rc\n")
 
-def keys(key):
-  '''Deal with failure of 2019-35-vintage Java fixup to detect
-     parameter-part-initial session ids'''
-  if m:=SESSION.match(key):
-    prefix=m[1]
-    e, b = m.span(2)
-    fixed=key[:e]+key[b:]
-    if fixed==m[1]:
-      return True, prefix[:-1], None
-    else:
-      return True, prefix, fixed
-  else:
-    return False, key, None
-
-DFQ = [] # for reordering if needed
-messyD = False
 
 def nextDate(df,dn):
-  global DEBUG, DFQ, DCNT, ISESSION
+  global DEBUG, DCNT, XCNT
   dl = df.readline()
   if dl == b'':
-    if DFQ:
-      if DEBUG:
-        raise ValueError("EOF but non-empty DFQ: %s"%DFQ)
     # write out the last of the last index file, if any
-    return "", "", "", 0, False
-  if DEBUG>1:
+    return "", "", "", 0
+  if DEBUG:
     sys.stderr.write("dl%s: %s\n"%(dn,dl))
   dkey, ddate, durl, dtime = dl.split(b'\t')
-  messyD = ISESSION.search(durl)
   DCNT += 1
-  return dkey, ddate, durl, dtime, messyD
+  return dkey, ddate, durl, dtime
 
 with open(sys.argv[1], 'rb') as df:
   DCNT = 0
-  dkey, ddate, durl, dtime, messyD = nextDate(df,1)
 
-  xq = []
+  dkey, ddate, durl, dtime = nextDate(df,1)
 
-  while (nlRes := nextLine(xq, messyD))[0] is not None:
-    (xl, xq) = nlRes
+  while (xl := nextLine())[0] is not None:
     xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
     m = URL.match(xprops)
     if m:
       xurl = m[1]
     else:
       raise ValueError("No url in %s"%xprops)
-    if DEBUG>1:
+    if DEBUG:
       sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                             for xp in (xkey, xdate, xurl))))
-    messyU, xkey1, xkey2 = keys(xkey)
-    if messyD:
-      noMatch = (not dkey.startswith(xkey1) or
-            (xkey2 is not None and dkey!=xkey2))
-      if messyU:
-        # better match
-        if noMatch:
-          print("Fail1: md: %s mu: %s\n"
-                "      xkey: %s\n"
-                "      dkey: %s\n"
-                "      xdate: %s\n"
-                "      ddate: %s\n"
-                "      xurl: %s\n"
-                "      durl: %s\n"
-                "DFQ: %s\n"
-                "k1, k2: |%s|%s|\n"
-                "FN: %s XCNT: %s DCNT: %s\n"
-                "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
-                          (b'\n     '.join(DFQ)).decode('ascii'),
-                          xkey1, xkey2, FN, XCNT, DCNT, xl),
-                file=sys.stderr)
-        # fall through to the ordinary (non-messy) match case
-      else:
-        # still looking, save if >= date else fall through to write
-        if DEBUG>1:
-          print("Diso: match: %s\n"
-                "      xkey: %s\n"
-                "      dkey: %s\n"
-                "      xdate: %s\n"
-                "      ddate: %s\n"
-                "      xurl: %s\n"
-                "      durl: %s\n"
-                "xl: %s"%(not noMatch,
-                          xkey, dkey, xdate, ddate, xurl, durl, xl),
-                file=sys.stderr)
-        if (dkey.startswith(xkey1) and
-            (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
-          xq.append(xl)
-          if DEBUG>1:
-            sys.stderr.write('xpush\n')
-          continue
-        # else fall through
-    if (ddate != xdate or
-            not dkey.startswith(xkey1) or
-            (xkey2 is not None and dkey!=xkey2) or
-        durl!=xurl):
-      if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
-        
-        print("Fail2: md: %s mu: %s\n"
+    if dkey==xkey and ddate==xdate and durl==xurl:
+      # Got it
+      NF.write(xkey)
+      NF.write(b' ')
+      NF.write(xdate)
+      NF.write(b' ')
+      NF.write(xprops[:-2])
+      NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
+      if DEBUG:
+        sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
+                                             for xp in (xkey, xdate, xurl))))
+        sys.stderr.write(" %d\n"%int(dtime[:-3]))
+
+      dkey, ddate, durl, dtime = nextDate(df,2)
+      continue
+    else:
+      if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
+        # we've missed something, disaster looms
+        print("Fail2:"
                "      xkey: %s\n"
                "      dkey: %s\n"
                "      xdate: %s\n"
                "      ddate: %s\n"
                "      xurl: %s\n"
                "      durl: %s\n"
-               "DFQ: %s\n"
-               "k1, k2: |%s|%s|\n"
                "FN: %s XCNT: %s DCNT: %s\n"
-               "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
+               "xl: %s"%(xkey, dkey, xdate, ddate,
                          xurl, durl,
-                         (b'\n     '.join(DFQ)).decode('ascii'),
-                         xkey1, xkey2, FN, XCNT, DCNT, xl),
+                         FN, XCNT, DCNT, xl),
               file=sys.stderr)
         # try to force recovery
-        dkey, ddate, durl, dtime, messyD = nextDate(df,3)
-      NF.write(xl)
-      if DEBUG>1:
-        sys.stderr.write("out_nl\n")
-      continue
-    # Got it
-    NF.write(xkey)
-    NF.write(b' ')
-    NF.write(xdate)
-    NF.write(b' ')
-    NF.write(xprops[:-2])
-    NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
-    if DEBUG>1:
-      sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
-                                           for xp in (xkey, xdate, xurl))))
-      sys.stderr.write(" %d\n"%int(dtime[:-3]))
-    
-    dkey, ddate, durl, dtime, messyD = nextDate(df,2)
+        dkey, ddate, durl, dtime = nextDate(df,3)
+        continue
+      # else fall through to write
+    NF.write(xl)
+    if DEBUG:
+      sys.stderr.write("out_nl\n")