changeset 91:460f0599e8cd

mostly working, but need to reorder in case of cfid and friends
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 05 Sep 2023 17:32:46 +0100
parents c1a70532444c
children e56a7aad9ce9
files bin/merge_date.py
diffstat 1 files changed, 80 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/bin/merge_date.py	Thu Aug 31 14:14:21 2023 +0100
+++ b/bin/merge_date.py	Tue Sep 05 17:32:46 2023 +0100
@@ -7,11 +7,30 @@
 ksvstream consists of tab-separated key, CC date and Unix timestamp
 ''' # '
 
-import sys, io, os, os.path
+import sys, io, os, os.path, time, re
 from isal import igzip
 
+if sys.argv[1] == '-d':
+  sys.argv.pop(1)
+  DEBUG = True
+else:
+  DEBUG = False
+
 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
-NPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[3]
+NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
+
+RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
+b'(crawldiagnostics|robotstxt)/')
+SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
+                     b'sid|jsessionid|aspsessionid[a-z]*)'
+                     b'=[^&]*)')
+
+# Above based on this from fixed Java code:
+#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 #print(sys.argv[3],NPATH,file=sys.stderr)
 
@@ -19,35 +38,72 @@
 
 FN = 0
 
+XCNT = 0
+dcnt = 0
+
 XF = igzip.IGzipFile(filename=XPATH%0)
-NF = open(NPATH%0,'wb')
-
-XL = b''
+NF = open(NN:=(NPATH%0),'wb')
 
 def nextLine():
-  global FN, NF, NPATH, XF, XPATH
-  xl=XF.readline()
-  if xl == b'':
-    # need to move to next index file
-    if NF is None:
-      FN = 0
-    else:
+  '''Move on to next index file if current has run out'''
+  global FN, NF, NPATH, NN, XF, XPATH, XCNT
+  while True:
+    xl=XF.readline()
+    XCNT += 1
+    if xl == b'':
+      # need to move to next index file
       FN += 1
-    xn=XPATH%FN
-    if not os.path.exists(xn):
-      return
-    XF = igzip.IGzipFile(filename=xn)
-    NF = open(NPATH%FN, 'wb')
-    xl = XF.readline()
-  return xl
+      XF.close()
+      NF.close()
+      print(NN, flush=True) # so we can compress it
+      time.sleep(0.1) # so it flushes?
+      XN=XPATH%FN
+      if not os.path.exists(XN):
+        return
+      XF = igzip.IGzipFile(filename=XN)
+      NF = open((NN:=NPATH%FN), 'wb')
+      xl = XF.readline()
+      XCNT = 1
+    if RorDPAT.search(xl):
+      #print(xl,file=sys.stderr)
+      continue
+    return xl
+
+def keys(key):
+  '''Deal with failure of 2019-35-vintage Java fixup to detect
+     parameter-part-initial session ids'''
+  if m:=SESSION.match(key):
+    prefix=m[1]
+    e, b = m.span(2)
+    fixed=key[:e]+key[b:]
+    if fixed==m[1]:
+      return prefix[:-1], None
+    else:
+      return prefix, fixed
+  else:
+    return key, None
 
 with open(sys.argv[1], 'rb') as df:
   dl = df.readline()
-  (dkey, ddate, dtime) = dl.split(b'\t')
+  dcnt += 1
+  dkey, ddate, dtime = dl.split(b'\t')
 
   while (xl:=nextLine()) is not None:
-    (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
-    if dkey != xkey or ddate != xdate:
+    xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
+    xkey1, xkey2 = keys(xkey)
+    if (ddate != xdate or
+        not dkey.startswith(xkey1) or
+        (xkey2 is not None and dkey!=xkey2)):
+      if DEBUG and xkey.decode('ascii')>(dkey.decode('ascii')):
+        print("Fail: xkey: %s\n"
+              "      dkey: %s\n"
+              "      xdate: %s\n"
+              "      ddate: %s\n"
+              "k1, k2: |%s|%s|\n"
+              "FN: %s\n"
+              "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, xl),
+              file=sys.stderr)
+        raise ValueError()
       NF.write(xl)
       continue
     NF.write(xkey)
@@ -61,4 +117,5 @@
       # write out the last of the last index file, if any
       dkey = ddate = None
     else:
-      (dkey, ddate, dtime) = dl.split(b'\t')
+      dcnt += 1
+      dkey, ddate, dtime = dl.split(b'\t')