changeset 139:e96d444b0f84

Fixed bug(s) with respect to large payload files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 23 Jul 2021 22:19:15 +0000
parents 9ea12f7b304b
children 0a447db5cf1c
files bin/warc.py
diffstat 1 files changed, 14 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Fri Jul 23 16:23:46 2021 +0000
+++ b/bin/warc.py	Fri Jul 23 22:19:15 2021 +0000
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
 '''Stream a warc format file, invoking a callback on each part.
 Callback can be limited by WARC-Type'''
-import sys
+import sys,os
 
 def warc(callback,types=['response']):
   nb=0
-  stream=open(sys.stdin.fileno(),'rb',0)
+  stream=open(sys.argv[1],'rb',0)
   bufsize=128*1024*1024
   buf=bytearray(128*1024*1024)
   l=b'\r\n'
@@ -14,7 +14,7 @@
       l=stream.readline()
       nb+=len(l)
     if l!=b'WARC/1.0\r\n':
-      if l==0:
+      if l==b'':
         return
       raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
                                                          l.decode('latin-1'),len(l)))
@@ -39,7 +39,7 @@
       raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length))
     nb+=length
     if wtype in types:
-      callback(wtype,bv)
+      callback(wtype,memoryview(buf[:length]))
   if whole and options.zipped:
     _output(bv)
     return
@@ -111,7 +111,16 @@
 
 OUT=open(sys.stdout.fileno(),'wb')
 
+import re
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
+
+def showmeLMH(wtype,buf):
+  m=LMPAT.search(buf.tobytes(order='A'))
+  if m:
+    OUT.write(m[1])
+  OUT.write(b'\n')
+
 def showme(wtype,buf):
   OUT.write(buf)
 
-warc(showme,[b'metadata'])
+warc(showmeLMH,[b'response'])