diff bin/merge_date.py @ 107:40c460fed99f

working on sessionID problems, still
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 17 Sep 2023 15:18:11 +0100
parents e606c609f813
children 52c6a9b0fc8c
line wrap: on
line diff
--- a/bin/merge_date.py	Thu Sep 14 19:27:23 2023 +0100
+++ b/bin/merge_date.py	Sun Sep 17 15:18:11 2023 +0100
@@ -27,12 +27,13 @@
 ISESSION = re.compile(SESSION.pattern,flags=re.I)
 URL=re.compile(b'\{"url": "([^"]*)"')
 
-# Above based on this from fixed Java code:
-#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
-#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
-#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
-#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
-#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
+# Above based on this from broken Java code:
+# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
+#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
+#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
+#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
+#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
+#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 #print(sys.argv[3],NPATH,file=sys.stderr)
 
@@ -112,12 +113,13 @@
     if messyD:
       noMatch = (ddate != xdate or
             not dkey.startswith(xkey1) or
-            dkey!=xkey1 or
+            (xkey2 is not None and dkey!=xkey2) or
             durl!=xurl)
       if messyU:
         # better match
         if noMatch:
-          raise ValueError("Fail: xkey: %s\n"
+          raise ValueError("Fail1: md: %s mu: %s\n"
+                "      xkey: %s\n"
                 "      dkey: %s\n"
                 "      xdate: %s\n"
                 "      ddate: %s\n"
@@ -126,7 +128,7 @@
                 "dfq: %s\n"
                 "k1, k2: |%s|%s|\n"
                 "FN: %s XCNT: %s DCNT: %s\n"
-                "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
+                "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
                           (b'\n     '.join(dfq)).decode('ascii'),
                           xkey1, xkey2, FN, XCNT, DCNT, xl))
         # fall through to the ordinary (non-messy) match case
@@ -156,14 +158,15 @@
         durl!=xurl):
       if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
         
-        raise ValueError("Fail: xkey: %s\n"
+        raise ValueError("Fail2: md: %s mu: %s\n"
+               "      xkey: %s\n"
                "      dkey: %s\n"
                "      xdate: %s\n"
                "      ddate: %s\n"
                "dfq: %s\n"
                "k1, k2: |%s|%s|\n"
                "FN: %s XCNT: %s DCNT: %s\n"
-               "xl: %s"%(xkey, dkey, xdate, ddate,
+               "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
                          (b'\n     '.join(dfq)).decode('ascii'),
                          xkey1, xkey2, FN, XCNT, DCNT, xl))
       NF.write(xl)