Mercurial > hg > cc > cirrus_work
changeset 46:44d3a4f4ea51
support on-board unzipping, reduce buffer size to 2MB
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 19:32:36 +0100 |
parents | 212da3fe3b19 |
children | b59f49909bda |
files | bin/warc.py |
diffstat | 1 files changed, 9 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/warc.py Wed Jul 05 19:32:02 2023 +0100
+++ b/bin/warc.py Wed Jul 05 19:32:36 2023 +0100
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
-'''Stream a gzipped warc format file, invoking a callback on each record.
-Callback can be limited by WARC-Type, record part'''
+'''Stream a warc format file, unzipping if necessary, invoking a
+callback on each record. Callback can be limited by WARC-Type, record
+part'''
 
 import sys,io
 from isal import igzip
@@ -8,9 +9,12 @@
 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
-  stream=open(filename,'rb',0)
-  bufsize=128*1024*1024
-  buf=bytearray(128*1024*1024)
+  if filename.endswith(".gz"):
+    stream=igzip.IGzipFile(filename=filename)
+  else:
+    stream=open(filename,'rb',0)
+  bufsize=2*1024*1024
+  buf=bytearray(bufsize)
   l=b'\r\n'
   while not stream.closed:
     bp=0
```