diff bin/merge_date.py @ 100:18446a7eeb9e

rework handling of session key problem
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 11 Sep 2023 12:56:47 +0100
parents 009e633eb804
children e2e64c3d763e
line wrap: on
line diff
--- a/bin/merge_date.py	Fri Sep 08 21:40:52 2023 +0100
+++ b/bin/merge_date.py	Mon Sep 11 12:56:47 2023 +0100
@@ -10,11 +10,11 @@
 import sys, io, os, os.path, time, re
 from isal import igzip
 
-if sys.argv[1] == '-d':
+
+DEBUG = 0
+while sys.argv[1] == '-d':
   sys.argv.pop(1)
-  DEBUG = True
-else:
-  DEBUG = False
+  DEBUG += 1  
 
 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
@@ -24,6 +24,8 @@
 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
                      b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                      b'=[^&]*)')
+ISESSION = re.compile(SESSION.pattern,flags=re.I)
+URL=re.compile(b'\{"url": "([^"]*)"')
 
 # Above based on this from fixed Java code:
 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
@@ -44,16 +46,17 @@
 XF = igzip.IGzipFile(filename=XPATH%0)
 NF = open(NN:=(NPATH%0),'wb')
 
-def nextLine():
+def nextLine(xq, messyD):
   '''Move on to next index file if current has run out'''
   global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
+  if xq and not messyD:
+    return xq.pop(0), xq
   while True:
     xl=XF.readline()
     XCNT += 1
     if xl == b'':
       # need to move to next index file
       FN += 1
-      DCNT=0 # this is relative to FN
       XF.close()
       NF.close()
       print(NN, flush=True) # so we can compress it
@@ -65,7 +68,7 @@
       NF = open((NN:=NPATH%FN), 'wb')
       xl = XF.readline()
       XCNT = 1
-    return xl
+    return xl, xq
 
 def keys(key):
   '''Deal with failure of 2019-35-vintage Java fixup to detect
@@ -82,58 +85,97 @@
     return False, key, None
 
 dfq = [] # for reordering if needed
+messyD = False
 
 with open(sys.argv[1], 'rb') as df:
   dl = df.readline()
   DCNT = 1
-  dkey, ddate, dtime = dl.split(b'\t')
+  if DEBUG>1:
+    sys.stderr.write("dl1: %s"%dl.decode('ascii'))
+  dkey, ddate, durl, dtime = dl.split(b'\t')
+  messyD = ISESSION.search(durl)
 
-  while (xl:=nextLine()) is not None:
+  xq = []
+
+  while (nlRes := nextLine(xq, messyD))[0] is not None:
+    (xl, xq) = nlRes
     xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
-    messy, xkey1, xkey2 = keys(xkey)
-    if messy:
-      stale=dfq
-      dfq=[]
-      while (dkey.startswith(xkey1) and
-             (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
-        dfq.append(dl)
-        if stale:
-          dl = stale.pop(0)
-        else:
-          dl = df.readline()
-          DCNT += 1
-        dkey, ddate, dtime = dl.split(b'\t')
+    m = URL.match(xprops)
+    if m:
+      xurl = m[1]
+    else:
+      raise ValueError("No url in %s"%xprops)
+    if DEBUG>1:
+      sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
+                                            for xp in (xkey, xdate, xurl))))
+    messyU, xkey1, xkey2 = keys(xkey)
+    if messyD:
+      if messyU:
+        # both sides are messy, so this had better match (else we raise)
+        if (ddate != xdate or
+            not dkey.startswith(xkey1) or
+            dkey!=xkey1 or
+            durl!=xurl):
+          raise ValueError("Fail: xkey: %s\n"
+                "      dkey: %s\n"
+                "      xdate: %s\n"
+                "      ddate: %s\n"
+                "      xurl: %s\n"
+                "      durl: %s\n"
+                "dfq: %s\n"
+                "k1, k2: |%s|%s|\n"
+                "FN: %s XCNT: %s DCNT: %s\n"
+                "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
+                          (b'\n     '.join(dfq)).decode('ascii'),
+                          xkey1, xkey2, FN, XCNT, DCNT, xl))
+        messyD = False
+        # fall through to the ordinary (non-messy) match case
+      else:
+        # still looking, save this one
+        if DEBUG:
+          print("Diso: xkey: %s\n"
+                "      dkey: %s\n"
+                "      xdate: %s\n"
+                "      ddate: %s\n"
+                "      xurl: %s\n"
+                "      durl: %s\n"
+                "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, xl),
+                file=sys.stderr)
+        xq.append(xl)
+        if DEBUG>1:
+          sys.stderr.write('xpush\n')
+        continue
+    else:
+      # Not messyD
+      if messyU:
+        raise ValueError("messyD w/o messyU")
     if (ddate != xdate or
-        not dkey.startswith(xkey1) or
-        (xkey2 is not None and dkey!=xkey2)):
-      if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
-        print("Fail: xkey: %s\n"
-              "      dkey: %s\n"
-              "      xdate: %s\n"
-              "      ddate: %s\n"
-              "dfq: %s\n"
-              "k1, k2: |%s|%s|\n"
-              "FN: %s XCNT: %s DCNT: %s\n"
-              "xl: %s"%(xkey, dkey, xdate, ddate,
-                        (b'\n     '.join(dfq)).decode('ascii'),
-                        xkey1, xkey2, FN, XCNT, DCNT, xl),
-              file=sys.stderr)
-        raise ValueError
+            not dkey.startswith(xkey1) or
+            (xkey2 is not None and dkey!=xkey2) or
+        durl!=xurl):
       NF.write(xl)
       continue
+    # Got it
     NF.write(xkey)
     NF.write(b' ')
     NF.write(xdate)
     NF.write(b' ')
     NF.write(xprops[:-2])
     NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
+    if DEBUG>1:
+      sys.stderr.write("out: %s"%(' '.join(xp.decode('ascii')
+                                           for xp in (xkey, xdate, xurl))))
+      sys.stderr.write(" %d\n"%int(dtime[:-3]))
     dl = df.readline()
     if dl == '':
       if dfq:
         if DEBUG:
-          breakpoint()
+          raise ValueError
       # write out the last of the last index file, if any
-      dkey = ddate = ""
+      dkey = ddate = durl = ""
     else:
+      if DEBUG>1:
+        sys.stderr.write("dl3: %s"%dl.decode('ascii'))
       DCNT += 1
-      dkey, ddate, dtime = dl.split(b'\t')
+      dkey, ddate, durl, dtime = dl.split(b'\t')
+      messyD = ISESSION.search(durl)