changeset 244:ce5b2c1da222

working together works well to provide what's needed to update a cdx to include lastmod where possible
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 03 Oct 2024 18:17:55 +0100
parents 7bef91ca3d51
children 1d6fe71f13f4
files lib/python/cc/cdx_extras.py lib/python/cc/unpackz.py
diffstat 2 files changed, 36 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/cdx_extras.py	Thu Oct 03 18:17:55 2024 +0100
@@ -0,0 +1,27 @@
+#!/usr/bin/python3
+'''Output the necessary for updating a segment index file with lastmod'''
+import sys, re
+
+import unpackz
+
+PAT = re.compile(b'\r\nLast-Modified: ([^\r]*)',re.MULTILINE)  # group 1 = header value, up to the next CR; MULTILINE has no effect here (pattern has no ^ or $)
+
+def whereandwhat(count,offset,data,outfile):
+  '''Callback from unpackz to extract what we need'''
+  # Prints count, offset and the Last-Modified value, tab-separated, to stdout
+  # (outfile is accepted but not used here); problems are reported on stderr.
+  if data.startswith(b'WARC/1.0\r\nWARC-Type: response\r\n') and \
+         b'Last-Modified: ' in data:  # cheap substring pre-filter, included for speed (worth factor of 3)
+    if (http_start := data.find(b'\r\nHTTP/')) == -1:  # first '\r\nHTTP/' -- presumably the HTTP status line
+      print("missing HTTP at %s %s"%(count,offset),
+            file=sys.stderr)
+      return
+    if (http_end := data.find(b'\r\n\r\n',http_start)) == -1:  # first blank line at or after http_start
+      print("missing end of HTTP at %s %s"%(count,offset),
+            file=sys.stderr)
+      return
+    if (m:=PAT.search(data,http_start,http_end)):  # search restricted to the span between the two markers
+      print(count,offset,m[1].decode('UTF-8'),sep='\t')  # NOTE(review): strict decode raises UnicodeDecodeError on non-UTF-8 header bytes
+      
+unpackz.unpackz(sys.argv[1],whereandwhat)  # runs at module level (also on import); argv[1] is presumably the input file name -- confirm against unpackz
+
--- a/lib/python/cc/unpackz.py	Wed Oct 02 19:54:45 2024 +0100
+++ b/lib/python/cc/unpackz.py	Thu Oct 03 18:17:55 2024 +0100
@@ -13,18 +13,21 @@
   with open(infileName,'rb') as f:
     z = isal.isal_zlib.decompressobj(31)
     count = 0
-    got = None # Keep the compiler happy
+    prev_buf = buf = got = None # Keep the compiler happy
+    ogot = None
     while True:
       if z.unused_data == b"": 
-          #print('n', obuf_len, file=sys.stderr)
+        #print('n', obuf_len, file=sys.stderr)
+        if ogot is not None:
+          ogot = ogot + got
+        else:
+          ogot = got
         if lastbuf:  # buf == b"":
           callback(obuf_len, offset, got, outfile)
           if count!=0:
             print("Unused data: count=%s offset=%s ?"%(count, offset),
                   file=sys.stderr)
           break
-        if nbuf:
-          obuf_len += BUFSIZE # still no EOS after a full buffer processed
         buf = f.read(BUFSIZE)
         nbuf = True
         lastbuf = ((truesize:=len(buf)) < BUFSIZE) # will only succeed if now at EOF
@@ -36,7 +39,8 @@
                 (len(buf)-len(z.unused_data))
         #if (offset == 1352249):
         #  breakpoint()
-        callback(count, offset, got, outfile)
+        callback(count, offset, got if ogot is None else ogot + got, outfile)
+        ogot = None
         offset += count
         count = 0
         buf = z.unused_data