diff bin/lmh_warc.py @ 42:689a0e311cd2

make warc.py a library, separate out testing
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 05 Jul 2023 15:37:16 +0100
parents
children 11a886a84a49
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/lmh_warc.py	Wed Jul 05 15:37:16 2023 +0100
@@ -0,0 +1,21 @@
+import re
+TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) # bytes pattern: group 1 is the WARC-Target-URI value, up to the first CR on that line
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) # bytes pattern: group 1 is the Last-Modified value, up to the first CR on that line
+
+def showmeLMH(wtype,buf,part):
+  global URI
+  if part==1:
+    if (m:=TUPAT.search(buf)):
+      URI=m[1]
+    else:
+      raise ValueError(b"No target URI in %s ??"%buf)
+  else:
+    m=LMPAT.search(buf)
+    OUT.write(URI)
+    if m:
+      OUT.write(b'\t')
+      OUT.write(m[1])
+    OUT.write(b'\n')
+
+warc(showmeLMH,[b'response'],parts=3) # NOTE(review): neither warc nor OUT (written to by showmeLMH) is defined or imported in this file -- confirm where they are supplied
+