comparison bin/warc.py @ 42:689a0e311cd2

make warc.py a library, separate out testing
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 05 Jul 2023 15:37:16 +0100
parents 8661062a50b1
children 69be1131bcc5
comparison
equal deleted inserted replaced
41:fa43c318749b 42:689a0e311cd2
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 '''Stream a warc format file, invoking a callback on each record. 2 '''Stream a gzipped warc format file, invoking a callback on each record.
3 Callback can be limited by WARC-Type, record part''' 3 Callback can be limited by WARC-Type, record part'''
4 import sys,os,io
5 4
6 if (debug:=(sys.argv[1]=='-d')): 5 import sys,io
7 sys.argv.pop(1) 6 from isal import igzip
8 7
9 def warc(callback,types=['response'],whole=False,parts=7): 8 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
10 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] 9 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
11 nb=0 10 nb=0
12 stream=open(sys.argv[1],'rb',0) 11 stream=open(filename,'rb',0)
13 bufsize=128*1024*1024 12 bufsize=128*1024*1024
14 buf=bytearray(128*1024*1024) 13 buf=bytearray(128*1024*1024)
15 l=b'\r\n' 14 l=b'\r\n'
16 while not stream.closed: 15 while not stream.closed:
17 bp=0 16 bp=0
102 if parts & 4: 101 if parts & 4:
103 callback(wtype,buf[balance:balance+wl],4) 102 callback(wtype,buf[balance:balance+wl],4)
104 state=1 103 state=1
105 104
106 L_start=rec_text.tell() 105 L_start=rec_text.tell()
107 OUT=open(sys.stdout.fileno(),'wb')
108
109 import re
110 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
111 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
112
113 def showmeLMH(wtype,buf,part):
114 global URI
115 if part==1:
116 if (m:=TUPAT.search(buf)):
117 URI=m[1]
118 else:
119 raise ValueError(b"No target URI in %s ??"%buf)
120 else:
121 m=LMPAT.search(buf)
122 OUT.write(URI)
123 if m:
124 OUT.write(b'\t')
125 OUT.write(m[1])
126 OUT.write(b'\n')
127
128 def showme(wtype,buf,part):
129 if debug:
130 breakpoint()
131 OUT.write(b"%d\n%b"%(part,buf))
132
133 warc(showmeLMH,[b'response'],parts=3)
134
135 #warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
136
137 #warc(showme,[b'response'],parts=int(sys.argv[2]))
138 #warc(showme,[b'response'],whole=True)