changeset 55:11a886a84a49

finds multiples
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 10 Jul 2023 18:17:35 +0100
parents 9c63039a9b6d
children f8c8f79b2532
files bin/lmh_warc.py
diffstat 1 files changed, 9 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/bin/lmh_warc.py	Fri Jul 07 19:30:23 2023 +0100
+++ b/bin/lmh_warc.py	Mon Jul 10 18:17:35 2023 +0100
@@ -1,7 +1,9 @@
-import re
+import re,swarc,sys
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
 
+OUT=open(sys.stdout.fileno(),'wb')
+
 def showmeLMH(wtype,buf,part):
   global URI
   if part==1:
@@ -10,12 +12,13 @@
     else:
       raise ValueError(b"No target URI in %s ??"%buf)
   else:
-    m=LMPAT.search(buf)
+    mm=LMPAT.findall(buf)
     OUT.write(URI)
-    if m:
-      OUT.write(b'\t')
-      OUT.write(m[1])
+    if mm:
+      for m in mm:
+        OUT.write(b'\t')
+        OUT.write(m)
     OUT.write(b'\n')
 
-warc(showmeLMH,[b'response'],parts=3)
+swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)