Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 42:689a0e311cd2
make warc.py a library, separate out testing
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 15:37:16 +0100 |
parents | 8661062a50b1 |
children | 69be1131bcc5 |
comparison (legend: equal, deleted, inserted, replaced)
41:fa43c318749b | 42:689a0e311cd2 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 '''Stream a warc format file, invoking a callback on each record. | 2 '''Stream a gzipped warc format file, invoking a callback on each record. |
3 Callback can be limited by WARC-Type, record part''' | 3 Callback can be limited by WARC-Type, record part''' |
4 import sys,os,io | |
5 | 4 |
6 if (debug:=(sys.argv[1]=='-d')): | 5 import sys,io |
7 sys.argv.pop(1) | 6 from isal import igzip |
8 | 7 |
9 def warc(callback,types=['response'],whole=False,parts=7): | 8 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): |
10 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] | 9 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] |
11 nb=0 | 10 nb=0 |
12 stream=open(sys.argv[1],'rb',0) | 11 stream=open(filename,'rb',0) |
13 bufsize=128*1024*1024 | 12 bufsize=128*1024*1024 |
14 buf=bytearray(128*1024*1024) | 13 buf=bytearray(128*1024*1024) |
15 l=b'\r\n' | 14 l=b'\r\n' |
16 while not stream.closed: | 15 while not stream.closed: |
17 bp=0 | 16 bp=0 |
102 if parts & 4: | 101 if parts & 4: |
103 callback(wtype,buf[balance:balance+wl],4) | 102 callback(wtype,buf[balance:balance+wl],4) |
104 state=1 | 103 state=1 |
105 | 104 |
106 L_start=rec_text.tell() | 105 L_start=rec_text.tell() |
107 OUT=open(sys.stdout.fileno(),'wb') | |
108 | |
109 import re | |
110 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) | |
111 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) | |
112 | |
113 def showmeLMH(wtype,buf,part): | |
114 global URI | |
115 if part==1: | |
116 if (m:=TUPAT.search(buf)): | |
117 URI=m[1] | |
118 else: | |
119 raise ValueError(b"No target URI in %s ??"%buf) | |
120 else: | |
121 m=LMPAT.search(buf) | |
122 OUT.write(URI) | |
123 if m: | |
124 OUT.write(b'\t') | |
125 OUT.write(m[1]) | |
126 OUT.write(b'\n') | |
127 | |
128 def showme(wtype,buf,part): | |
129 if debug: | |
130 breakpoint() | |
131 OUT.write(b"%d\n%b"%(part,buf)) | |
132 | |
133 warc(showmeLMH,[b'response'],parts=3) | |
134 | |
135 #warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2])) | |
136 | |
137 #warc(showme,[b'response'],parts=int(sys.argv[2])) | |
138 #warc(showme,[b'response'],whole=True) |