comparison bin/warc.py @ 46:44d3a4f4ea51

Support built-in unzipping of .gz input (via isal igzip); reduce buffer size from 128MB to 2MB
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 05 Jul 2023 19:32:36 +0100
parents 69be1131bcc5
children d0d2fd9830d6
comparison
equal deleted inserted replaced
45:212da3fe3b19 46:44d3a4f4ea51
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 '''Stream a gzipped warc format file, invoking a callback on each record. 2 '''Stream a warc format file, unzipping if necessary, invoking a
3 Callback can be limited by WARC-Type, record part''' 3 callback on each record. Callback can be limited by WARC-Type, record
4 part'''
4 5
5 import sys,io 6 import sys,io
6 from isal import igzip 7 from isal import igzip
7 8
8 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): 9 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
9 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] 10 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
10 nb=0 11 nb=0
11 stream=open(filename,'rb',0) 12 if filename.endswith(".gz"):
12 bufsize=128*1024*1024 13 stream=igzip.IGzipFile(filename=filename)
13 buf=bytearray(128*1024*1024) 14 else:
15 stream=open(filename,'rb',0)
16 bufsize=2*1024*1024
17 buf=bytearray(bufsize)
14 l=b'\r\n' 18 l=b'\r\n'
15 while not stream.closed: 19 while not stream.closed:
16 bp=0 20 bp=0
17 while l==b'\r\n': 21 while l==b'\r\n':
18 l=stream.readline() 22 l=stream.readline()