diff bin/lmh_warc.py @ 64:b14187ccfb46

Revert to showing only the first Last-Modified (LM) header value
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 19 Jul 2023 13:19:42 +0100
parents 11a886a84a49
children 120d90b47d74
line wrap: on
line diff
--- a/bin/lmh_warc.py	Fri Jul 14 17:39:14 2023 +0100
+++ b/bin/lmh_warc.py	Wed Jul 19 13:19:42 2023 +0100
@@ -1,4 +1,6 @@
-import re,swarc,sys
+#!/usr/bin/env python3
+
+import re,warc,sys
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
 
@@ -12,13 +14,12 @@
     else:
       raise ValueError(b"No target URI in %s ??"%buf)
   else:
-    mm=LMPAT.findall(buf)
+    mm=LMPAT.search(buf)
     OUT.write(URI)
     if mm:
-      for m in mm:
-        OUT.write(b'\t')
-        OUT.write(m)
+      OUT.write(b'\t')
+      OUT.write(mm[1])
     OUT.write(b'\n')
 
-swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
+warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)