changeset 145:170844e51987

loosen WARC pattern so it no longer fails when a "mime" = "{...}" field intervenes before "filename"
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 03 Oct 2023 17:43:52 +0100
parents ec79bb4ccd74
children 83fbd652a014
files lib/python/cc/lmh/merge_date.py
diffstat 1 files changed, 2 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/merge_date.py	Mon Oct 02 18:56:50 2023 +0100
+++ b/lib/python/cc/lmh/merge_date.py	Tue Oct 03 17:43:52 2023 +0100
@@ -35,7 +35,7 @@
                      b'=[^&]*)')
 ISESSION = re.compile(SESSION.pattern,flags=re.I)
 URL=re.compile(b'\{"url": "([^"]*)"')
-WARC=re.compile(b' \{[^}]*"filename": "([^/]*/){4}warc/')
+WARC=re.compile(b' \{.*", "filename": "([^/]*/){4}warc/')
 
 # Above based on this from broken Java code:
 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
@@ -133,7 +133,7 @@
     else:
       NF.write(xl)
       if DEBUG:
-        sys.stderr.write("out_rc\n")
+        sys.stderr.write("out_rc\n%s\n"%xl)
 
 
 def nextDate(dn):