changeset 107:40c460fed99f

Still working on sessionID problems
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 17 Sep 2023 15:18:11 +0100
parents 6104acc1345b
children 9f7a35bf07f9
files bin/merge_date.py bin/sort_date.py
diffstat 2 files changed, 34 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/bin/merge_date.py	Thu Sep 14 19:27:23 2023 +0100
+++ b/bin/merge_date.py	Sun Sep 17 15:18:11 2023 +0100
@@ -27,12 +27,13 @@
 ISESSION = re.compile(SESSION.pattern,flags=re.I)
 URL=re.compile(b'\{"url": "([^"]*)"')
 
-# Above based on this from fixed Java code:
-#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
-#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
-#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
-#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
-#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
+# The regexes above are based on the following patterns from the broken Java code:
+# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 #print(sys.argv[3],NPATH,file=sys.stderr)
 
@@ -112,12 +113,13 @@
     if messyD:
       noMatch = (ddate != xdate or
             not dkey.startswith(xkey1) or
-            dkey!=xkey1 or
+            (xkey2 is not None and dkey!=xkey2) or
             durl!=xurl)
       if messyU:
         # better match
         if noMatch:
-          raise ValueError("Fail: xkey: %s\n"
+          raise ValueError("Fail1: md: %s mu: %s\n"
+                "      xkey: %s\n"
                 "      dkey: %s\n"
                 "      xdate: %s\n"
                 "      ddate: %s\n"
@@ -126,7 +128,7 @@
                 "dfq: %s\n"
                 "k1, k2: |%s|%s|\n"
                 "FN: %s XCNT: %s DCNT: %s\n"
-                "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
+                "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
                           (b'\n     '.join(dfq)).decode('ascii'),
                           xkey1, xkey2, FN, XCNT, DCNT, xl))
         # fall through to the ordinary (non-messy) match case
@@ -156,14 +158,15 @@
         durl!=xurl):
       if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
         
-        raise ValueError("Fail: xkey: %s\n"
+        raise ValueError("Fail2: md: %s mu: %s\n"
+               "      xkey: %s\n"
                "      dkey: %s\n"
                "      xdate: %s\n"
                "      ddate: %s\n"
                "dfq: %s\n"
                "k1, k2: |%s|%s|\n"
                "FN: %s XCNT: %s DCNT: %s\n"
-               "xl: %s"%(xkey, dkey, xdate, ddate,
+               "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
                          (b'\n     '.join(dfq)).decode('ascii'),
                          xkey1, xkey2, FN, XCNT, DCNT, xl))
       NF.write(xl)
--- a/bin/sort_date.py	Thu Sep 14 19:27:23 2023 +0100
+++ b/bin/sort_date.py	Sun Sep 17 15:18:11 2023 +0100
@@ -43,6 +43,7 @@
 # And Java strips so-called option session-ids, but python doesn't
 
 import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
+import surt.URLRegexTransformer
 
 def notDefaultCanon(hu,**options):
   if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
@@ -56,10 +57,25 @@
       hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
     except ValueError:
       pass
-  # Either we don't hit any, or Java doesn't do path_strip_session_id
-  options.setdefault('query_strip_session_id',False)
-  return surt.DefaultIAURLCanonicalizer.canonicalize(hu,
-                                                **options)
+
+  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
+
+# Hack: override surt's query session-id regexes to reproduce the Java bug
+surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
+    re.compile(b"(.*&)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.*&)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.*&[a-z]*)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.*&)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
+    re.compile(b"(.*&)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
+    ]
+
+# The regexes above are based on the following patterns from the broken Java code:
+# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 def cdx_key(uristring):
   _surt = quote(unquote(surt.surt(uristring,