Mercurial > hg > cc > cirrus_work
diff bin/lmh_warc.py @ 64:b14187ccfb46
revert to just showing first LM
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 19 Jul 2023 13:19:42 +0100 |
parents | 11a886a84a49 |
children | 120d90b47d74 |
line wrap: on
line diff
--- a/bin/lmh_warc.py Fri Jul 14 17:39:14 2023 +0100
+++ b/bin/lmh_warc.py Wed Jul 19 13:19:42 2023 +0100
@@ -1,4 +1,6 @@
-import re,swarc,sys
+#!/usr/bin/env python3
+
+import re,warc,sys
 
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
@@ -12,13 +14,12 @@
     else:
       raise ValueError(b"No target URI in %s ??"%buf)
   else:
-    mm=LMPAT.findall(buf)
+    mm=LMPAT.search(buf)
     OUT.write(URI)
     if mm:
-      for m in mm:
-        OUT.write(b'\t')
-        OUT.write(m)
+      OUT.write(b'\t')
+      OUT.write(mm[1])
     OUT.write(b'\n')
 
 
-swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
+warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)