changeset 113:4a52585a1aac

refactor datestream reading, fix pattern ordering in SESSION
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 26 Sep 2023 09:03:47 +0100
parents 827eadc72122
children 5818d79c4ec9
files bin/merge_date.py
diffstat 1 files changed, 35 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/bin/merge_date.py	Mon Sep 25 23:53:13 2023 +0100
+++ b/bin/merge_date.py	Tue Sep 26 09:03:47 2023 +0100
@@ -4,7 +4,7 @@
 
 Usage: merge_date.py ksvstream cdx-dir outdir
 
-ksvstream consists of tab-separated key, CC date and Unix timestamp
+ksvstream consists of tab-separated key, CC date, url and Unix timestamp
 ''' # '
 
 import sys, io, os, os.path, time, re
@@ -22,7 +22,7 @@
 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
 b'(crawldiagnostics|robotstxt)/')
 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
-                     b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
+                     b'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
                      b'=[^&]*)')
 ISESSION = re.compile(SESSION.pattern,flags=re.I)
 URL=re.compile(b'\{"url": "([^"]*)"')
@@ -85,16 +85,28 @@
   else:
     return False, key, None
 
-dfq = [] # for reordering if needed
+DFQ = [] # for reordering if needed
 messyD = False
 
+def nextDate(df,dn):
+  global DEBUG, DFQ, DCNT, ISESSION
+  dl = df.readline()
+  if dl == b'':
+    if DFQ:
+      if DEBUG:
+        raise ValueError("EOF but non-empty DFQ: %s"%DFQ)
+    # write out the last of the last index file, if any
+    return "", "", "", 0, False
+  if DEBUG>1:
+    sys.stderr.write("dl%s: %s\n"%(dn,dl))
+  dkey, ddate, durl, dtime = dl.split(b'\t')
+  messyD = ISESSION.search(durl)
+  DCNT += 1
+  return dkey, ddate, durl, dtime, messyD
+
 with open(sys.argv[1], 'rb') as df:
-  dl = df.readline()
-  DCNT = 1
-  if DEBUG>1:
-    sys.stderr.write("dl1: %s"%dl.decode('ascii'))
-  dkey, ddate, durl, dtime = dl.split(b'\t')
-  messyD = ISESSION.search(durl)
+  DCNT = 0
+  dkey, ddate, durl, dtime, messyD = nextDate(df,1)
 
   xq = []
 
@@ -116,19 +128,20 @@
       if messyU:
         # better match
         if noMatch:
-          raise ValueError("Fail1: md: %s mu: %s\n"
+          print("Fail1: md: %s mu: %s\n"
                 "      xkey: %s\n"
                 "      dkey: %s\n"
                 "      xdate: %s\n"
                 "      ddate: %s\n"
                 "      xurl: %s\n"
                 "      durl: %s\n"
-                "dfq: %s\n"
+                "DFQ: %s\n"
                 "k1, k2: |%s|%s|\n"
                 "FN: %s XCNT: %s DCNT: %s\n"
                 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
-                          (b'\n     '.join(dfq)).decode('ascii'),
-                          xkey1, xkey2, FN, XCNT, DCNT, xl))
+                          (b'\n     '.join(DFQ)).decode('ascii'),
+                          xkey1, xkey2, FN, XCNT, DCNT, xl),
+                file=sys.stderr)
         # fall through to the ordinary (non-messy) match case
       else:
         # still looking, save if >= date else fall through to write
@@ -156,20 +169,23 @@
         durl!=xurl):
       if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
         
-        raise ValueError("Fail2: md: %s mu: %s\n"
+        print("Fail2: md: %s mu: %s\n"
                "      xkey: %s\n"
                "      dkey: %s\n"
                "      xdate: %s\n"
                "      ddate: %s\n"
                "      xurl: %s\n"
                "      durl: %s\n"
-               "dfq: %s\n"
+               "DFQ: %s\n"
                "k1, k2: |%s|%s|\n"
                "FN: %s XCNT: %s DCNT: %s\n"
                "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
                          xurl, durl,
-                         (b'\n     '.join(dfq)).decode('ascii'),
-                         xkey1, xkey2, FN, XCNT, DCNT, xl))
+                         (b'\n     '.join(DFQ)).decode('ascii'),
+                         xkey1, xkey2, FN, XCNT, DCNT, xl),
+              file=sys.stderr)
+        # try to force recovery
+        dkey, ddate, durl, dtime, messyD = nextDate(df,3)
       NF.write(xl)
       if DEBUG>1:
         sys.stderr.write("out_nl\n")
@@ -185,16 +201,5 @@
       sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
                                            for xp in (xkey, xdate, xurl))))
       sys.stderr.write(" %d\n"%int(dtime[:-3]))
-    dl = df.readline()
-    if dl == '':
-      if dfq:
-        if DEBUG:
-          raise ValueError
-      # write out the last of the last index file, if any
-      dkey = ddate = durl = ""
-    else:
-      if DEBUG>1:
-        sys.stderr.write("dl3: %s"%dl.decode('ascii'))
-      DCNT += 1
-      dkey, ddate, durl, dtime = dl.split(b'\t')
-      messyD = ISESSION.search(durl)
+    
+    dkey, ddate, durl, dtime, messyD = nextDate(df,2)