# HG changeset patch # User Henry S. Thompson # Date 1688581956 -3600 # Node ID 44d3a4f4ea51a9efd8b6735ee40d74658238c3d9 # Parent 212da3fe3b199d56c15793567b8051f330141e48 support on-board unzipping, reduce buffer size to 2MB diff -r 212da3fe3b19 -r 44d3a4f4ea51 bin/warc.py --- a/bin/warc.py Wed Jul 05 19:32:02 2023 +0100 +++ b/bin/warc.py Wed Jul 05 19:32:36 2023 +0100 @@ -1,6 +1,7 @@ #!/usr/bin/env python3 -'''Stream a gzipped warc format file, invoking a callback on each record. -Callback can be limited by WARC-Type, record part''' +'''Stream a warc format file, unzipping if necessary, invoking a +callback on each record. Callback can be limited by WARC-Type, record +part''' import sys,io from isal import igzip @@ -8,9 +9,12 @@ def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] nb=0 - stream=open(filename,'rb',0) - bufsize=128*1024*1024 - buf=bytearray(128*1024*1024) + if filename.endswith(".gz"): + stream=igzip.IGzipFile(filename=filename) + else: + stream=open(filename,'rb',0) + bufsize=2*1024*1024 + buf=bytearray(bufsize) l=b'\r\n' while not stream.closed: bp=0