changeset 112:827eadc72122

Make session-id regexps more faithful to the Java originals; print the URI as str rather than bytes
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 25 Sep 2023 23:53:13 +0100
parents ab3d547f3e76
children 4a52585a1aac
files bin/sort_date.py
diffstat 1 files changed, 7 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/bin/sort_date.py	Fri Sep 22 15:27:28 2023 +0100
+++ b/bin/sort_date.py	Mon Sep 25 23:53:13 2023 +0100
@@ -75,11 +75,11 @@
 
 # Hack this to reproduce the Java bug
 surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
-    re.compile(b"(.*&)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.*&)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.*[a-z]*)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.*&)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
-    re.compile(b"(.*&)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
+    re.compile(b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
     ]
 
 # Above based on this from broken Java code:
@@ -120,6 +120,6 @@
   ctk=cmp_to_key(locale.strcoll)
   for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
                    key=lambda x:x[0]):
-    print(key[0],key[1],key[2].encode('ascii',errors='java_unicode'),
+    print(key[0],key[1],
+          key[2].encode('ascii',errors='java_unicode').decode('ascii'),
           ts,sep='\t')
-