changeset 140:d8b134f6ab03

fix a bad fix and a bad test for the televida case
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 30 Sep 2023 18:04:15 +0100
parents bb3ca6c5a037
children 14d3802112b2
files lib/python/cc/lmh/sort_date.py lib/python/cc/lmh/test/key_tests.tsv lib/python/cc/lmh/test/test_keys.py
diffstat 3 files changed, 8 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py	Sat Sep 30 14:13:19 2023 +0100
+++ b/lib/python/cc/lmh/sort_date.py	Sat Sep 30 18:04:15 2023 +0100
@@ -154,7 +154,7 @@
     # <change>
     path = escapeOnce(OGU.unescapeRepeatedly(path),
                       safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f'''  #')
-                      ).replace(b'\x7f',b'\\x7f')
+                      )
     # Wrt \x7f (DEL), see "biz,televida)" case                        )
     # It remains to be seen whether other non-printing bytes
     #  will need to be handled, which would require a regexp
@@ -185,7 +185,7 @@
 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 def cdx_key(uristring):
-  if '\\' in uristring:
+  if '\\u' in uristring:
     uristring=bytes(uristring,'utf-8').decode('unicode_escape')
   _surt = surt.surt(uristring, canonicalizer=fixGoogleCanon)
   return WPAT.sub(')',_surt)
--- a/lib/python/cc/lmh/test/key_tests.tsv	Sat Sep 30 14:13:19 2023 +0100
+++ b/lib/python/cc/lmh/test/key_tests.tsv	Sat Sep 30 18:04:15 2023 +0100
@@ -2,9 +2,10 @@
 x7f in query	http://futnsz.cn/?tag=www.%7Fy19799.com	cn,futnsz)/?tag=www.y19799.com
 Non-final PHPSESSIONID	http://www.ccmmi.com.br/busca?busca=http%3A%2F%2Fvtservices85.fr%2Fsmf2%2Findex.php%3FPHPSESSID%3Dfa69565f8acb1f23da02e6729eef3643%26topic%3D126838.0&tipo=All	br,com,ccmmi)/busca?busca=http://vtservices85.fr/smf2/index.php?topic=126838.0&tipo=all
 Final PHPSESSIONID in quoted URI	https://www.eu-kommunal-kompass.de/index.php/foerderdatenbank?option=com_redirectpage&view=redirectpage&url=http%3A%2F%2Fwww.umweltbildung.de%2Ffileadmin%2Finhalte-projekte%2FKommune%2FANU-BNE_im_komm._KS_Abschlussbericht_pblc1b.pdf%3FPHPSESSID%3Da23dac7de24bb2f1e53978474b0d9c2d	de,eu-kommunal-kompass)/index.php/foerderdatenbank?option=com_redirectpage&url=http://www.umweltbildung.de/fileadmin/inhalte-projekte/kommune/anu-bne_im_komm._ks_abschlussbericht_pblc1b.pdf?&view=redirectpage
-x7f raw in path	http://www.televida.biz/en/customers/media/177-%7F5th-anniversary-%E2%80%9Cviva-la-ma%C3%B1ana%E2%80%9D-from-guatevision-channel.html	biz,televida)/en/customers/media/177-\x7f5th-anniversary-%e2%80%9cviva-la-ma%c3%b1ana%e2%80%9d-from-guatevision-channel.html
+# ^? in key on next line is a _real_ DEL == \x7f character
+x7f raw in path	http://www.televida.biz/en/customers/media/177-%7F5th-anniversary-%E2%80%9Cviva-la-ma%C3%B1ana%E2%80%9D-from-guatevision-channel.html	biz,televida)/en/customers/media/177-5th-anniversary-%e2%80%9cviva-la-ma%c3%b1ana%e2%80%9d-from-guatevision-channel.html
 Not octal IPv4	http://143.107.225.017/	17,225,107,143)/
-# [duplicate deleted]
+## [duplicate deleted]
 %25 in query	http://223.223.160.154/ngt/livecam/index.php?login=+%C4%B9%CC%E7%A1%A6%BB%B0%B6%F9%A1%A1%A5%E9%A5%A4%A5%D6%A5%AB%A5%E1%A5%E9	154,160,223,223)/ngt/livecam/index.php?login=+%c4%b9%25cc%e7%a1%a6%25bb%25b0%25b6%25f9%25a1%25a1%25a5%e9%a5%a4%25a5%d6%a5%25ab%25a5%25e1%25a5%25e9
 %25 in path	http://www.55yq.cn/www.dm190.com/list/%25CE%25FC%D1%AA%25B9%25ED-new--.html	cn,55yq)/www.dm190.com/list/%25ce%25fc%d1%aa%25b9%25ed-new--.html
 CFID last	http://173.161.106.217/Accounts/Login.cfm?ts=%7Bts%20'2018-12-07%2001:22:30'%7D&CFID=81256&CFTOKEN=e2e985379a7dd979-4B757A99-E22D-6CFF-6FD928437D3F2BA3	217,106,161,173)/accounts/login.cfm?&ts={ts%20'2018-12-07%2001:22:30'}
--- a/lib/python/cc/lmh/test/test_keys.py	Sat Sep 30 14:13:19 2023 +0100
+++ b/lib/python/cc/lmh/test/test_keys.py	Sat Sep 30 18:04:15 2023 +0100
@@ -9,12 +9,11 @@
                        'key_tests.tsv'),'r') as f:
   i = 0
   for l in f:
-    if l[0] == '\n':
+    if l[0] in '#\n':
+      if l.startswith('##'):
+        i+=1 # use this to preserve numbering for a deletion
       continue
     i+=1
-    if l[0] == '#':
-      # preserve numbering
-      continue
     d, u, k = l.rstrip().split('\t')
     kk = cdx_key(u)
     if kk == k: