changeset 46:44d3a4f4ea51

support on-board unzipping, reduce buffer size to 2MB
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 05 Jul 2023 19:32:36 +0100
parents 212da3fe3b19
children b59f49909bda
files bin/warc.py
diffstat 1 files changed, 9 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Wed Jul 05 19:32:02 2023 +0100
+++ b/bin/warc.py	Wed Jul 05 19:32:36 2023 +0100
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
-'''Stream a gzipped warc format file, invoking a callback on each record.
-Callback can be limited by WARC-Type, record part'''
+'''Stream a warc format file, unzipping if necessary, invoking a
+callback on each record.  Callback can be limited by WARC-Type, record
+part'''
 
 import sys,io
 from isal import igzip
@@ -8,9 +9,12 @@
 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
-  stream=open(filename,'rb',0)
-  bufsize=128*1024*1024
-  buf=bytearray(128*1024*1024)
+  if filename.endswith(".gz"):
+    stream=igzip.IGzipFile(filename=filename)
+  else:
+    stream=open(filename,'rb',0)
+  bufsize=2*1024*1024
+  buf=bytearray(bufsize)
   l=b'\r\n'
   while not stream.closed:
     bp=0