# HG changeset patch # User Henry S. Thompson # Date 1694960291 -3600 # Node ID 40c460fed99f54ee195ce0c35df33dff8a28bf19 # Parent 6104acc1345b88ec10c48470a911b1f931d50b62 working on sessionID pblms, still diff -r 6104acc1345b -r 40c460fed99f bin/merge_date.py --- a/bin/merge_date.py Thu Sep 14 19:27:23 2023 +0100 +++ b/bin/merge_date.py Sun Sep 17 15:18:11 2023 +0100 @@ -27,12 +27,13 @@ ISESSION = re.compile(SESSION.pattern,flags=re.I) URL=re.compile(b'\{"url": "([^"]*)"') -# Above based on this from fixed Java code: -#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), -#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), -#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), -#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), -#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", +# Above based on this from broken Java code: +# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 +#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), +#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), +#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), +#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), +#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", #print(sys.argv[3],NPATH,file=sys.stderr) @@ -112,12 +113,13 @@ if messyD: noMatch = (ddate != xdate or not dkey.startswith(xkey1) or - dkey!=xkey1 or + (xkey2 is not None and dkey!=xkey2) or durl!=xurl) if messyU: # better match if noMatch: - raise ValueError("Fail: xkey: %s\n" + raise ValueError("Fail1: md: %s mu: %s\n" + " xkey: %s\n" " dkey: %s\n" " xdate: %s\n" " ddate: %s\n" @@ -126,7 +128,7 @@ "dfq: %s\n" "k1, k2: |%s|%s|\n" "FN: %s XCNT: %s DCNT: %s\n" - "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, + "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, (b'\n '.join(dfq)).decode('ascii'), xkey1, xkey2, FN, XCNT, DCNT, xl)) # fall through to the ordinary (non-messy) match case @@ -156,14 +158,15 @@ durl!=xurl): if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): - raise ValueError("Fail: xkey: %s\n" + raise ValueError("Fail2: md: %s mu: %s\n" + " xkey: %s\n" " dkey: %s\n" " xdate: %s\n" " ddate: %s\n" "dfq: %s\n" "k1, k2: |%s|%s|\n" "FN: %s XCNT: %s DCNT: %s\n" - "xl: %s"%(xkey, dkey, xdate, ddate, + "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, (b'\n '.join(dfq)).decode('ascii'), xkey1, xkey2, FN, XCNT, DCNT, xl)) NF.write(xl) diff -r 6104acc1345b -r 40c460fed99f bin/sort_date.py --- a/bin/sort_date.py Thu Sep 14 19:27:23 2023 +0100 +++ b/bin/sort_date.py Sun Sep 17 15:18:11 2023 +0100 @@ -43,6 +43,7 @@ # And Java strips so-called option session-ids, but python doesn't import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer +import surt.URLRegexTransformer def notDefaultCanon(hu,**options): if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): @@ -56,10 +57,25 @@ hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) except ValueError: pass - # Either we don't hit any, or Java doesn't do path_strip_session_id - options.setdefault('query_strip_session_id',False) - return surt.DefaultIAURLCanonicalizer.canonicalize(hu, - **options) + + return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) + +# Hack this to reproduce the Java bug +surt.URLRegexTransformer._RES_QUERY_SESSIONID = [ + re.compile(b"(.*&)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.*&)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.*&[a-z]*)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I), + re.compile(b"(.*&)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), + re.compile(b"(.*&)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), + ] + +# Above based on this from broken Java code: +# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 +#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), +#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), +#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), +#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), +#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", def cdx_key(uristring): _surt = quote(unquote(surt.surt(uristring,