Mercurial > hg > cc > cirrus_work
changeset 55:11a886a84a49
finds multiples
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 10 Jul 2023 18:17:35 +0100 |
parents | 9c63039a9b6d |
children | f8c8f79b2532 |
files | bin/lmh_warc.py |
diffstat | 1 file changed, 9 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/lmh_warc.py Fri Jul 07 19:30:23 2023 +0100
+++ b/bin/lmh_warc.py Mon Jul 10 18:17:35 2023 +0100
@@ -1,7 +1,9 @@
-import re
+import re,swarc,sys
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
 
+OUT=open(sys.stdout.fileno(),'wb')
+
 def showmeLMH(wtype,buf,part):
   global URI
   if part==1:
@@ -10,12 +12,13 @@
     else:
       raise ValueError(b"No target URI in %s ??"%buf)
   else:
-    m=LMPAT.search(buf)
+    mm=LMPAT.findall(buf)
     OUT.write(URI)
-    if m:
-      OUT.write(b'\t')
-      OUT.write(m[1])
+    if mm:
+      for m in mm:
+        OUT.write(b'\t')
+        OUT.write(m)
     OUT.write(b'\n')
 
 
-warc(showmeLMH,[b'response'],parts=3)
+swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
```