Mercurial > hg > cc > cirrus_work
diff bin/warc.py @ 42:689a0e311cd2
make warc.py a library, separate out testing
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 15:37:16 +0100 |
parents | 8661062a50b1 |
children | 69be1131bcc5 |
line wrap: on
line diff
```diff
--- a/bin/warc.py Wed Jul 05 15:12:54 2023 +0100
+++ b/bin/warc.py Wed Jul 05 15:37:16 2023 +0100
@@ -1,15 +1,14 @@
 #!/usr/bin/env python3
-'''Stream a warc format file, invoking a callback on each record.
+'''Stream a gzipped warc format file, invoking a callback on each record.
 Callback can be limited by WARC-Type, record part'''
 
-import sys,os,io
-if (debug:=(sys.argv[1]=='-d')):
-  sys.argv.pop(1)
+import sys,io
+from isal import igzip
 
-def warc(callback,types=['response'],whole=False,parts=7):
+def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
-  stream=open(sys.argv[1],'rb',0)
+  stream=open(filename,'rb',0)
   bufsize=128*1024*1024
   buf=bytearray(128*1024*1024)
   l=b'\r\n'
@@ -104,35 +103,3 @@
   state=1
   L_start=rec_text.tell()
 
-OUT=open(sys.stdout.fileno(),'wb')
-
-import re
-TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
-LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
-
-def showmeLMH(wtype,buf,part):
-  global URI
-  if part==1:
-    if (m:=TUPAT.search(buf)):
-      URI=m[1]
-    else:
-      raise ValueError(b"No target URI in %s ??"%buf)
-  else:
-    m=LMPAT.search(buf)
-    OUT.write(URI)
-    if m:
-      OUT.write(b'\t')
-      OUT.write(m[1])
-    OUT.write(b'\n')
-
-def showme(wtype,buf,part):
-  if debug:
-    breakpoint()
-  OUT.write(b"%d\n%b"%(part,buf))
-
-warc(showmeLMH,[b'response'],parts=3)
-
-#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
-
-#warc(showme,[b'response'],parts=int(sys.argv[2]))
-#warc(showme,[b'response'],whole=True)
```