Mercurial > hg > cc > cirrus_work
changeset 145:170844e51987
loosen WARC pattern to avoid failure from "mime" = "{...}" intervening
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 03 Oct 2023 17:43:52 +0100 |
parents | ec79bb4ccd74 |
children | 83fbd652a014 |
files | lib/python/cc/lmh/merge_date.py |
diffstat | 1 files changed, 2 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/merge_date.py Mon Oct 02 18:56:50 2023 +0100
+++ b/lib/python/cc/lmh/merge_date.py Tue Oct 03 17:43:52 2023 +0100
@@ -35,7 +35,7 @@
 b'=[^&]*)')
 ISESSION = re.compile(SESSION.pattern,flags=re.I)
 URL=re.compile(b'\{"url": "([^"]*)"')
-WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
+WARC=re.compile(b' \{.*", "filename": "([^/]*/){4}warc/')
 # Above based on this from broken Java code:
 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
@@ -133,7 +133,7 @@
 else:
 NF.write(xl)
 if DEBUG:
- sys.stderr.write("out_rc\n")
+ sys.stderr.write("out_rc\n%s\n"%xl)
 def nextDate(dn):