changeset 84:c18c307cc325

merge, including pointless fix wrt pq
author Henry Thompson <ht@markup.co.uk>
date Mon, 21 Aug 2023 13:06:20 -0400
parents d92bd8527718 (current diff) 120d90b47d74 (diff)
children 1daa8e444cfe
files bin/sort_date.py
diffstat 1 files changed, 11 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/bin/lmh_warc.py	Sat Aug 19 16:33:23 2023 -0400
+++ b/bin/lmh_warc.py	Mon Aug 21 13:06:20 2023 -0400
@@ -2,22 +2,32 @@
 
 import re,warc,sys
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
+DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
 
+DTAB=bytearray(range(256))
+DDEL=b'TZ-:'
+
 OUT=open(sys.stdout.fileno(),'wb')
 
 def showmeLMH(wtype,buf,part):
-  global URI
+  global URI, DATE
   if part==1:
     if (m:=TUPAT.search(buf)):
       URI=m[1]
     else:
       raise ValueError(b"No target URI in %s ??"%buf)
+    if (md:=DPAT.search(buf)):
+      DATE=md[1]
+    else:
+      raise ValueError(b"No date in %s ??"%buf)
   else:
     mm=LMPAT.search(buf)
     OUT.write(URI)
     if mm:
       OUT.write(b'\t')
+      OUT.write(DATE.translate(DTAB,DDEL))
+      OUT.write(b'\t')
       OUT.write(mm[1])
     OUT.write(b'\n')