changeset 93:25bd398a8035

improve reordering, still failing on cdx-00004
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 Sep 2023 18:51:21 +0100
parents e56a7aad9ce9
children 009e633eb804
files bin/merge_date.py
diffstat 1 files changed, 32 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/bin/merge_date.py	Tue Sep 05 17:33:29 2023 +0100
+++ b/bin/merge_date.py	Wed Sep 06 18:51:21 2023 +0100
@@ -22,7 +22,7 @@
 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
 b'(crawldiagnostics|robotstxt)/')
 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
-                     b'sid|jsessionid|aspsessionid[a-z]*)'
+                     b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
                      b'=[^&]*)')
 
 # Above based on this from fixed Java code:
@@ -39,20 +39,21 @@
 FN = 0
 
 XCNT = 0
-dcnt = 0
+DCNT = 0
 
 XF = igzip.IGzipFile(filename=XPATH%0)
 NF = open(NN:=(NPATH%0),'wb')
 
 def nextLine():
   '''Move on to next index file if current has run out'''
-  global FN, NF, NPATH, NN, XF, XPATH, XCNT
+  global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
   while True:
     xl=XF.readline()
     XCNT += 1
     if xl == b'':
       # need to move to next index file
       FN += 1
+      DCNT=0 # this is relative to FN
       XF.close()
       NF.close()
       print(NN, flush=True) # so we can compress it
@@ -64,9 +65,6 @@
       NF = open((NN:=NPATH%FN), 'wb')
       xl = XF.readline()
       XCNT = 1
-    if RorDPAT.search(xl):
-      #print(xl,file=sys.stderr)
-      continue
     return xl
 
 def keys(key):
@@ -77,44 +75,50 @@
     e, b = m.span(2)
     fixed=key[:e]+key[b:]
     if fixed==m[1]:
-      return prefix[:-1], None
+      return True, prefix[:-1], None
     else:
-      return prefix, fixed
+      return True, prefix, fixed
   else:
-    return key, None
+    return False, key, None
 
 dfq = [] # for reordering if needed
 
 with open(sys.argv[1], 'rb') as df:
-  if dfq:
-    dl = dfq.pop(0) 
-  else:
-    dl = df.readline()
-    dcnt += 1
+  dl = df.readline()
+  DCNT = 1
   dkey, ddate, dtime = dl.split(b'\t')
 
   while (xl:=nextLine()) is not None:
     xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
-    xkey1, xkey2 = keys(xkey)
-    if xkey2 is not None:
-      while dkey.startswith(xkey1) and dkey!=xkey2:
+    messy, xkey1, xkey2 = keys(xkey)
+    if messy:
+      stale=dfq
+      dfq=[]
+      while (dkey.startswith(xkey1) and
+             (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))):
         dfq.append(dl)
-        dl = df.readline()
-        dcnt += 1
+        if stale:
+          dl = stale.pop(0)
+        else:
+          dl = df.readline()
+          DCNT += 1
         dkey, ddate, dtime = dl.split(b'\t')
     if (ddate != xdate or
         not dkey.startswith(xkey1) or
         (xkey2 is not None and dkey!=xkey2)):
-      if DEBUG and xkey.decode('ascii')>(dkey.decode('ascii')):
+      if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
         print("Fail: xkey: %s\n"
               "      dkey: %s\n"
               "      xdate: %s\n"
               "      ddate: %s\n"
+              "dfq: %s\n"
               "k1, k2: |%s|%s|\n"
-              "FN: %s dcnt: %s\n"
-              "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, dcnt, xl),
+              "FN: %s XCNT: %s DCNT: %s\n"
+              "xl: %s"%(xkey, dkey, xdate, ddate,
+                        (b'\n     '.join(dfq)).decode('ascii'),
+                        xkey1, xkey2, FN, XCNT, DCNT, xl),
               file=sys.stderr)
-        raise ValueError()
+        breakpoint()
       NF.write(xl)
       continue
     NF.write(xkey)
@@ -125,8 +129,11 @@
     NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
     dl = df.readline()
     if dl == '':
+      if dfq:
+        if DEBUG:
+          breakpoint()
       # write out the last of the last index file, if any
-      dkey = ddate = None
+      dkey = ddate = ""
     else:
-      dcnt += 1
+      DCNT += 1
       dkey, ddate, dtime = dl.split(b'\t')