Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 46:44d3a4f4ea51
support on-board unzipping, reduce buffer size to 2MB
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 19:32:36 +0100 |
parents | 69be1131bcc5 |
children | d0d2fd9830d6 |
comparison
equal
deleted
inserted
replaced
45:212da3fe3b19 | 46:44d3a4f4ea51 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 '''Stream a gzipped warc format file, invoking a callback on each record. | 2 '''Stream a warc format file, unzipping if necessary, invoking a |
3 Callback can be limited by WARC-Type, record part''' | 3 callback on each record. Callback can be limited by WARC-Type, record |
| | 4 part''' |
4 | 5 |
5 import sys,io | 6 import sys,io |
6 from isal import igzip | 7 from isal import igzip |
7 | 8 |
8 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): | 9 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): |
9 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] | 10 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] |
10 nb=0 | 11 nb=0 |
11 stream=open(filename,'rb',0) | 12 if filename.endswith(".gz"): |
12 bufsize=128*1024*1024 | 13 stream=igzip.IGzipFile(filename=filename) |
13 buf=bytearray(128*1024*1024) | 14 else: |
| | 15 stream=open(filename,'rb',0) |
| | 16 bufsize=2*1024*1024 |
| | 17 buf=bytearray(bufsize) |
14 l=b'\r\n' | 18 l=b'\r\n' |
15 while not stream.closed: | 19 while not stream.closed: |
16 bp=0 | 20 bp=0 |
17 while l==b'\r\n': | 21 while l==b'\r\n': |
18 l=stream.readline() | 22 l=stream.readline() |