Mercurial > hg > cc > cirrus_work
changeset 140:d8b134f6ab03
fix a bad fix and a bad test for the televida case
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sat, 30 Sep 2023 18:04:15 +0100 |
parents | bb3ca6c5a037 |
children | 14d3802112b2 |
files | lib/python/cc/lmh/sort_date.py lib/python/cc/lmh/test/key_tests.tsv lib/python/cc/lmh/test/test_keys.py |
diffstat | 3 files changed, 8 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/lib/python/cc/lmh/sort_date.py	Sat Sep 30 14:13:19 2023 +0100
+++ b/lib/python/cc/lmh/sort_date.py	Sat Sep 30 18:04:15 2023 +0100
@@ -154,7 +154,7 @@
     # <change>
     path = escapeOnce(OGU.unescapeRepeatedly(path),
                       safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #')
-                      ).replace(b'\x7f',b'\\x7f')
+                      ) # Wrt \x7f (DEL), see "biz,televida)" case
     )
     # It remains to be seen whether other non-printing bytes
     # will need to be handled, which would require a regexp
@@ -185,7 +185,7 @@
 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 def cdx_key(uristring):
-  if '\\' in uristring:
+  if '\\u' in uristring:
     uristring=bytes(uristring,'utf-8').decode('unicode_escape')
   _surt = surt.surt(uristring, canonicalizer=fixGoogleCanon)
   return WPAT.sub(')',_surt)
```
```diff
--- a/lib/python/cc/lmh/test/key_tests.tsv	Sat Sep 30 14:13:19 2023 +0100
+++ b/lib/python/cc/lmh/test/key_tests.tsv	Sat Sep 30 18:04:15 2023 +0100
@@ -2,9 +2,10 @@
 x7f in query	http://futnsz.cn/?tag=www.%7Fy19799.com	cn,futnsz)/?tag=www.y19799.com
 Non-final PHPSESSIONID	http://www.ccmmi.com.br/busca?busca=http%3A%2F%2Fvtservices85.fr%2Fsmf2%2Findex.php%3FPHPSESSID%3Dfa69565f8acb1f23da02e6729eef3643%26topic%3D126838.0&tipo=All	br,com,ccmmi)/busca?busca=http://vtservices85.fr/smf2/index.php?topic=126838.0&tipo=all
 Final PHPSESSIONID in quoted URI	https://www.eu-kommunal-kompass.de/index.php/foerderdatenbank?option=com_redirectpage&view=redirectpage&url=http%3A%2F%2Fwww.umweltbildung.de%2Ffileadmin%2Finhalte-projekte%2FKommune%2FANU-BNE_im_komm._KS_Abschlussbericht_pblc1b.pdf%3FPHPSESSID%3Da23dac7de24bb2f1e53978474b0d9c2d	de,eu-kommunal-kompass)/index.php/foerderdatenbank?option=com_redirectpage&url=http://www.umweltbildung.de/fileadmin/inhalte-projekte/kommune/anu-bne_im_komm._ks_abschlussbericht_pblc1b.pdf?&view=redirectpage
-x7f raw in path	http://www.televida.biz/en/customers/media/177-%7F5th-anniversary-%E2%80%9Cviva-la-ma%C3%B1ana%E2%80%9D-from-guatevision-channel.html	biz,televida)/en/customers/media/177-\x7f5th-anniversary-%e2%80%9cviva-la-ma%c3%b1ana%e2%80%9d-from-guatevision-channel.html
+# ^? in key on next line is a _real_ DEL == \x7f character
+x7f raw in path	http://www.televida.biz/en/customers/media/177-%7F5th-anniversary-%E2%80%9Cviva-la-ma%C3%B1ana%E2%80%9D-from-guatevision-channel.html	biz,televida)/en/customers/media/177-5th-anniversary-%e2%80%9cviva-la-ma%c3%b1ana%e2%80%9d-from-guatevision-channel.html
 Not octal IPv4	http://143.107.225.017/	17,225,107,143)/
-# [duplicate deleted]
+## [duplicate deleted]
 %25 in query	http://223.223.160.154/ngt/livecam/index.php?login=+%C4%B9%CC%E7%A1%A6%BB%B0%B6%F9%A1%A1%A5%E9%A5%A4%A5%D6%A5%AB%A5%E1%A5%E9	154,160,223,223)/ngt/livecam/index.php?login=+%c4%b9%25cc%e7%a1%a6%25bb%25b0%25b6%25f9%25a1%25a1%25a5%e9%a5%a4%25a5%d6%a5%25ab%25a5%25e1%25a5%25e9
 %25 in path	http://www.55yq.cn/www.dm190.com/list/%25CE%25FC%D1%AA%25B9%25ED-new--.html	cn,55yq)/www.dm190.com/list/%25ce%25fc%d1%aa%25b9%25ed-new--.html
 CFID last	http://173.161.106.217/Accounts/Login.cfm?ts=%7Bts%20'2018-12-07%2001:22:30'%7D&CFID=81256&CFTOKEN=e2e985379a7dd979-4B757A99-E22D-6CFF-6FD928437D3F2BA3	217,106,161,173)/accounts/login.cfm?&ts={ts%20'2018-12-07%2001:22:30'}
```
```diff
--- a/lib/python/cc/lmh/test/test_keys.py	Sat Sep 30 14:13:19 2023 +0100
+++ b/lib/python/cc/lmh/test/test_keys.py	Sat Sep 30 18:04:15 2023 +0100
@@ -9,12 +9,11 @@
                        'key_tests.tsv'),'r') as f:
     i = 0
     for l in f:
-      if l[0] == '\n':
+      if l[0] in '#\n':
+        if l.startswith('##'):
+          i+=1 # use this to preserve numbering for a deletion
         continue
       i+=1
-      if l[0] == '#':
-        # preserve numbering
-        continue
       d, u, k = l.rstrip().split('\t')
       kk = cdx_key(u)
       if kk == k:
```