# HG changeset patch # User Henry S. Thompson # Date 1741473074 0 # Node ID 0ec17b2aab72f8ee318a378c840315998f3197ed # Parent e461601592dddd822c13333f39dd226bd8f20b6b fix GMT fix, %-encode utf8-bytes (which probably will open the door to all the other hacks in sort-date :-( diff -r e461601592dd -r 0ec17b2aab72 lib/python/cc/lmh/warc2cdb.py --- a/lib/python/cc/lmh/warc2cdb.py Fri Mar 07 21:17:47 2025 +0000 +++ b/lib/python/cc/lmh/warc2cdb.py Sat Mar 08 22:31:14 2025 +0000 @@ -6,6 +6,7 @@ import re,warc,sys,glob,codecs,os.path import cython, typing import email.utils +from urllib.parse import quote TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) @@ -37,7 +38,7 @@ if mm: dateTime=mm[1] if dateTime.endswith(b'GMT'): - dateTime = FFPAT.sub('\\1 GMT',dateTime) + dateTime = FFPAT.sub(b'\\1 GMT',dateTime) try: try: lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp()) @@ -47,6 +48,11 @@ print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) return DATE=(DATE.translate(DTAB,DDEL)) + try: + URI.decode('ascii') + except UnicodeDecodeError: + URI=quote(URI, safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') + ).encode('ascii') OUT.write(b'+') OUT.write(b'%d'%(len(DATE)+len(URI))) OUT.write(b',')