Mercurial > hg > cc > cirrus_work
changeset 285:0ec17b2aab72 default tip
fix GMT fix;
%-encode UTF-8 bytes (which will probably open the door to all the other hacks in sort-date) :-(
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sat, 08 Mar 2025 22:31:14 +0000 |
parents | e461601592dd |
children | |
files | lib/python/cc/lmh/warc2cdb.py |
diffstat | 1 files changed, 7 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py Fri Mar 07 21:17:47 2025 +0000
+++ b/lib/python/cc/lmh/warc2cdb.py Sat Mar 08 22:31:14 2025 +0000
@@ -6,6 +6,7 @@
 import re,warc,sys,glob,codecs,os.path
 import cython, typing
 import email.utils
+from urllib.parse import quote
 
 TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
@@ -37,7 +38,7 @@
     if mm:
       dateTime=mm[1]
       if dateTime.endswith(b'GMT'):
-        dateTime = FFPAT.sub('\\1 GMT',dateTime)
+        dateTime = FFPAT.sub(b'\\1 GMT',dateTime)
       try:
         try:
           lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp())
@@ -47,6 +48,11 @@
           print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
           return
   DATE=(DATE.translate(DTAB,DDEL))
+  try:
+    URI.decode('ascii')
+  except UnicodeDecodeError:
+    URI=quote(URI, safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #')
+              ).encode('ascii')
   OUT.write(b'+')
   OUT.write(b'%d'%(len(DATE)+len(URI)))
   OUT.write(b',')