Mercurial > hg > cc > cirrus_home
comparison bin/warc.py @ 139:e96d444b0f84
fixed bug(s) wrt large payload files
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 23 Jul 2021 22:19:15 +0000 |
parents | 9ea12f7b304b |
children | d123ef7fdb82 |
comparison
equal
deleted
inserted
replaced
138:9ea12f7b304b | 139:e96d444b0f84 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 '''Stream a warc format file, invoking a callback on each part. | 2 '''Stream a warc format file, invoking a callback on each part. |
3 Callback can be limited by WARC-Type''' | 3 Callback can be limited by WARC-Type''' |
4 import sys | 4 import sys,os |
5 | 5 |
6 def warc(callback,types=['response']): | 6 def warc(callback,types=['response']): |
7 nb=0 | 7 nb=0 |
8 stream=open(sys.stdin.fileno(),'rb',0) | 8 stream=open(sys.argv[1],'rb',0) |
9 bufsize=128*1024*1024 | 9 bufsize=128*1024*1024 |
10 buf=bytearray(128*1024*1024) | 10 buf=bytearray(128*1024*1024) |
11 l=b'\r\n' | 11 l=b'\r\n' |
12 while True: | 12 while True: |
13 while l==b'\r\n': | 13 while l==b'\r\n': |
14 l=stream.readline() | 14 l=stream.readline() |
15 nb+=len(l) | 15 nb+=len(l) |
16 if l!=b'WARC/1.0\r\n': | 16 if l!=b'WARC/1.0\r\n': |
17 if l==0: | 17 if l==b'': |
18 return | 18 return |
19 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), | 19 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), |
20 l.decode('latin-1'),len(l))) | 20 l.decode('latin-1'),len(l))) |
21 wtype=None | 21 wtype=None |
22 length=None | 22 length=None |
37 bv=memoryview(buf)[ii:length] | 37 bv=memoryview(buf)[ii:length] |
38 if ii!=length: | 38 if ii!=length: |
39 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) | 39 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) |
40 nb+=length | 40 nb+=length |
41 if wtype in types: | 41 if wtype in types: |
42 callback(wtype,bv) | 42 callback(wtype,memoryview(buf[:length])) |
43 if whole and options.zipped: | 43 if whole and options.zipped: |
44 _output(bv) | 44 _output(bv) |
45 return | 45 return |
46 gzip_chunk = io.BytesIO(bv) | 46 gzip_chunk = io.BytesIO(bv) |
47 uv=memoryview(buf)[length:] | 47 uv=memoryview(buf)[length:] |
109 _output(cb[balance:balance+wl]) | 109 _output(cb[balance:balance+wl]) |
110 return | 110 return |
111 | 111 |
112 OUT=open(sys.stdout.fileno(),'wb') | 112 OUT=open(sys.stdout.fileno(),'wb') |
113 | 113 |
114 import re | |
115 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) | |
116 | |
117 def showmeLMH(wtype,buf): | |
118 m=LMPAT.search(buf.tobytes(order='A')) | |
119 if m: | |
120 OUT.write(m[1]) | |
121 OUT.write(b'\n') | |
122 | |
114 def showme(wtype,buf): | 123 def showme(wtype,buf): |
115 OUT.write(buf) | 124 OUT.write(buf) |
116 | 125 |
117 warc(showme,[b'metadata']) | 126 warc(showmeLMH,[b'response']) |