comparison bin/warc.py @ 139:e96d444b0f84

fixed bug(s) wrt large payload files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 23 Jul 2021 22:19:15 +0000
parents 9ea12f7b304b
children d123ef7fdb82
comparison
equal deleted inserted replaced
138:9ea12f7b304b 139:e96d444b0f84
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 '''Stream a warc format file, invoking a callback on each part. 2 '''Stream a warc format file, invoking a callback on each part.
3 Callback can be limited by WARC-Type''' 3 Callback can be limited by WARC-Type'''
4 import sys 4 import sys,os
5 5
6 def warc(callback,types=['response']): 6 def warc(callback,types=['response']):
7 nb=0 7 nb=0
8 stream=open(sys.stdin.fileno(),'rb',0) 8 stream=open(sys.argv[1],'rb',0)
9 bufsize=128*1024*1024 9 bufsize=128*1024*1024
10 buf=bytearray(128*1024*1024) 10 buf=bytearray(128*1024*1024)
11 l=b'\r\n' 11 l=b'\r\n'
12 while True: 12 while True:
13 while l==b'\r\n': 13 while l==b'\r\n':
14 l=stream.readline() 14 l=stream.readline()
15 nb+=len(l) 15 nb+=len(l)
16 if l!=b'WARC/1.0\r\n': 16 if l!=b'WARC/1.0\r\n':
17 if l==0: 17 if l==b'':
18 return 18 return
19 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), 19 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
20 l.decode('latin-1'),len(l))) 20 l.decode('latin-1'),len(l)))
21 wtype=None 21 wtype=None
22 length=None 22 length=None
37 bv=memoryview(buf)[ii:length] 37 bv=memoryview(buf)[ii:length]
38 if ii!=length: 38 if ii!=length:
39 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) 39 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length))
40 nb+=length 40 nb+=length
41 if wtype in types: 41 if wtype in types:
42 callback(wtype,bv) 42 callback(wtype,memoryview(buf[:length]))
43 if whole and options.zipped: 43 if whole and options.zipped:
44 _output(bv) 44 _output(bv)
45 return 45 return
46 gzip_chunk = io.BytesIO(bv) 46 gzip_chunk = io.BytesIO(bv)
47 uv=memoryview(buf)[length:] 47 uv=memoryview(buf)[length:]
109 _output(cb[balance:balance+wl]) 109 _output(cb[balance:balance+wl])
110 return 110 return
111 111
112 OUT=open(sys.stdout.fileno(),'wb') 112 OUT=open(sys.stdout.fileno(),'wb')
113 113
114 import re
115 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
116
117 def showmeLMH(wtype,buf):
118 m=LMPAT.search(buf.tobytes(order='A'))
119 if m:
120 OUT.write(m[1])
121 OUT.write(b'\n')
122
114 def showme(wtype,buf): 123 def showme(wtype,buf):
115 OUT.write(buf) 124 OUT.write(buf)
116 125
117 warc(showme,[b'metadata']) 126 warc(showmeLMH,[b'response'])