Mercurial > hg > cc > cirrus_work
changeset 42:689a0e311cd2
make warc.py a library, separate out testing
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 15:37:16 +0100 |
parents | fa43c318749b |
children | 69be1131bcc5 |
files | bin/lmh_warc.py bin/test_warc.py bin/warc.py |
diffstat | 3 files changed, 43 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
```diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/lmh_warc.py	Wed Jul 05 15:37:16 2023 +0100
@@ -0,0 +1,21 @@
+import re
+TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
+
+def showmeLMH(wtype,buf,part):
+  global URI
+  if part==1:
+    if (m:=TUPAT.search(buf)):
+      URI=m[1]
+    else:
+      raise ValueError(b"No target URI in %s ??"%buf)
+  else:
+    m=LMPAT.search(buf)
+    OUT.write(URI)
+    if m:
+      OUT.write(b'\t')
+      OUT.write(m[1])
+    OUT.write(b'\n')
+
+warc(showmeLMH,[b'response'],parts=3)
+
```
```diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/test_warc.py	Wed Jul 05 15:37:16 2023 +0100
@@ -0,0 +1,17 @@
+import warc,sys
+
+OUT=open(sys.stdout.fileno(),'wb')
+
+if (debug:=(sys.argv[1]=='-d')):
+  sys.argv.pop(1)
+
+def showme(wtype,buf,part):
+  if debug:
+    breakpoint()
+  OUT.write(b"%d\n%b"%(part,buf))
+
+
+#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
+#warc(showme,[b'response'],whole=True)
+
+warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug)
```
--- a/bin/warc.py Wed Jul 05 15:12:54 2023 +0100 +++ b/bin/warc.py Wed Jul 05 15:37:16 2023 +0100 @@ -1,15 +1,14 @@ #!/usr/bin/env python3 -'''Stream a warc format file, invoking a callback on each record. +'''Stream a gzipped warc format file, invoking a callback on each record. Callback can be limited by WARC-Type, record part''' -import sys,os,io -if (debug:=(sys.argv[1]=='-d')): - sys.argv.pop(1) +import sys,io +from isal import igzip -def warc(callback,types=['response'],whole=False,parts=7): +def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] nb=0 - stream=open(sys.argv[1],'rb',0) + stream=open(filename,'rb',0) bufsize=128*1024*1024 buf=bytearray(128*1024*1024) l=b'\r\n' @@ -104,35 +103,3 @@ state=1 L_start=rec_text.tell() -OUT=open(sys.stdout.fileno(),'wb') - -import re -TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) -LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) - -def showmeLMH(wtype,buf,part): - global URI - if part==1: - if (m:=TUPAT.search(buf)): - URI=m[1] - else: - raise ValueError(b"No target URI in %s ??"%buf) - else: - m=LMPAT.search(buf) - OUT.write(URI) - if m: - OUT.write(b'\t') - OUT.write(m[1]) - OUT.write(b'\n') - -def showme(wtype,buf,part): - if debug: - breakpoint() - OUT.write(b"%d\n%b"%(part,buf)) - -warc(showmeLMH,[b'response'],parts=3) - -#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2])) - -#warc(showme,[b'response'],parts=int(sys.argv[2])) -#warc(showme,[b'response'],whole=True)