changeset 285:0ec17b2aab72 default tip

fix GMT fix, %-encode utf8-bytes (which probably will open the door to all the other hacks in sort-date) :-(
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 08 Mar 2025 22:31:14 +0000
parents e461601592dd
children
files lib/python/cc/lmh/warc2cdb.py
diffstat 1 files changed, 7 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py	Fri Mar 07 21:17:47 2025 +0000
+++ b/lib/python/cc/lmh/warc2cdb.py	Sat Mar 08 22:31:14 2025 +0000
@@ -6,6 +6,7 @@
 import re,warc,sys,glob,codecs,os.path
 import cython, typing
 import email.utils
+from urllib.parse import quote
 
 TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
@@ -37,7 +38,7 @@
     if mm:
       dateTime=mm[1]
       if dateTime.endswith(b'GMT'):
-        dateTime = FFPAT.sub('\\1 GMT',dateTime)
+        dateTime = FFPAT.sub(b'\\1 GMT',dateTime)
       try:
         try:
           lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp())
@@ -47,6 +48,11 @@
         print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
         return
       DATE=(DATE.translate(DTAB,DDEL))
+      try:
+        URI.decode('ascii')
+      except UnicodeDecodeError:
+        URI=quote(URI, safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f'''  #')
+                  ).encode('ascii')
       OUT.write(b'+')
       OUT.write(b'%d'%(len(DATE)+len(URI)))
       OUT.write(b',')