changeset 79:120d90b47d74

include timestamp (the record's WARC-Date) in the output
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 21 Aug 2023 13:37:07 +0100
parents fef49258d738
children c18c307cc325
files bin/lmh_warc.py
diffstat 1 files changed, 11 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/bin/lmh_warc.py	Sun Aug 20 00:28:43 2023 +0100
+++ b/bin/lmh_warc.py	Mon Aug 21 13:37:07 2023 +0100
@@ -2,22 +2,32 @@
 
 import re,warc,sys
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
+DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
 
+DTAB=bytearray(range(256))
+DDEL=b'TZ-:'
+
 OUT=open(sys.stdout.fileno(),'wb')
 
 def showmeLMH(wtype,buf,part):
-  global URI
+  global URI, DATE
   if part==1:
     if (m:=TUPAT.search(buf)):
       URI=m[1]
     else:
       raise ValueError(b"No target URI in %s ??"%buf)
+    if (md:=DPAT.search(buf)):
+      DATE=md[1]
+    else:
+      raise ValueError(b"No date in %s ??"%buf)
   else:
     mm=LMPAT.search(buf)
     OUT.write(URI)
     if mm:
       OUT.write(b'\t')
+      OUT.write(DATE.translate(DTAB,DDEL))
+      OUT.write(b'\t')
       OUT.write(mm[1])
     OUT.write(b'\n')