Mercurial > hg > cc > cirrus_work
diff bin/lmh_warc.py @ 42:689a0e311cd2
make warc.py a library, separate out testing
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 15:37:16 +0100 |
parents | |
children | 11a886a84a49 |
line wrap: on
line diff
"""bin/lmh_warc.py: write each record's target URI and Last-Modified value.

Registers showmeLMH as the callback for warc(), restricted to b'response'
records with parts=3.

NOTE(review): neither OUT nor warc is defined or imported in this file as
shown -- confirm where they are expected to come from.
"""
import re

# Both patterns capture a header value up to (not including) the carriage
# return that ends the header line.
TUPAT = re.compile(b'^WARC-Target-URI: (.*?)\r', re.MULTILINE)
LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE)


def showmeLMH(wtype, buf, part):
    """Record the target URI from part 1, then write one line for later parts.

    Args:
        wtype: Not used by this callback.
        buf: Bytes-like header text to search.
        part: 1 selects the branch that looks for WARC-Target-URI; any other
            value selects the branch that looks for Last-Modified.
            NOTE(review): presumably 1 is the WARC header block and the
            other value is the HTTP header block -- confirm against warc().

    Raises:
        ValueError: If part is 1 and buf has no WARC-Target-URI header.
    """
    global URI
    if part != 1:
        # Search before writing, so a line is always emitted for the URI
        # even when there is no Last-Modified header to append.
        last_modified = LMPAT.search(buf)
        OUT.write(URI)
        if last_modified:
            OUT.write(b'\t')
            OUT.write(last_modified[1])
        OUT.write(b'\n')
        return
    target = TUPAT.search(buf)
    if not target:
        raise ValueError(b"No target URI in %s ??" % buf)
    # Kept in a module global so the following call for the same record
    # can write it out.
    URI = target[1]


warc(showmeLMH, [b'response'], parts=3)