# HG changeset patch # User Henry S. Thompson # Date 1696351432 -3600 # Node ID 170844e519874bb0d1c96abc3da919ef73aeb0ed # Parent ec79bb4ccd7414c8a3b04117f489c3f273fe8b37 loosen WARC pattern to avoid failure from "mime" = "{...}" intervening diff -r ec79bb4ccd74 -r 170844e51987 lib/python/cc/lmh/merge_date.py --- a/lib/python/cc/lmh/merge_date.py Mon Oct 02 18:56:50 2023 +0100 +++ b/lib/python/cc/lmh/merge_date.py Tue Oct 03 17:43:52 2023 +0100 @@ -35,7 +35,7 @@ b'=[^&]*)') ISESSION = re.compile(SESSION.pattern,flags=re.I) URL=re.compile(b'\{"url": "([^"]*)"') -WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/') +WARC=re.compile(b' \{.*", "filename": "([^/]*/){4}warc/') # Above based on this from broken Java code: # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 @@ -133,7 +133,7 @@ else: NF.write(xl) if DEBUG: - sys.stderr.write("out_rc\n") + sys.stderr.write("out_rc\n%s\n"%xl) def nextDate(dn):