Mercurial > hg > cc > cirrus_work
view bin/lmh_warc.py @ 80:db3c689175fe
catching up by hand with markup version,
adding query string
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 19 Aug 2023 15:53:59 -0400 |
parents | b14187ccfb46 |
children | 120d90b47d74 |
line wrap: on
line source
#!/usr/bin/env python3
"""Print the target URI and Last-Modified header value for WARC records.

The WARC file named by the first command-line argument is handed to the
project-local ``warc`` module, which calls back into ``showmeLMH``.  One
line is written to standard output per URI: the WARC-Target-URI, followed
by a tab and the Last-Modified value when that header is found.

Everything is handled as bytes; URIs and header values are copied through
without decoding.
"""
import re
import warc
import sys

# Both patterns are anchored at the start of a header line and stop at the
# carriage return that ends it.
TUPAT = re.compile(b'^WARC-Target-URI: (.*?)\r', re.MULTILINE)
# HTTP field names are case-insensitive, and servers do send variants such
# as "last-modified:" -- match regardless of case so they are not dropped.
LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE | re.IGNORECASE)

# Binary handle on standard output, so bytes are written as-is.
OUT = open(sys.stdout.fileno(), 'wb')


def showmeLMH(wtype, buf, part):
    """Callback for ``warc.warc``: remember the URI, then emit a line.

    wtype -- supplied by the caller; not used here
    buf   -- bytes-like buffer to search
    part  -- 1 means ``buf`` is searched for WARC-Target-URI and the
             result is remembered; any other value means ``buf`` is
             searched for Last-Modified and an output line is written

    Raises ValueError when ``part`` is 1 and no WARC-Target-URI line is
    found in ``buf``.
    """
    # NOTE(review): presumably parts=3 makes warc.warc deliver the WARC
    # header as part 1 followed by the HTTP header as part 2 -- confirm
    # against the warc module.  URI is carried between the two calls in a
    # module-level global.
    global URI
    if part == 1:
        if (m := TUPAT.search(buf)):
            URI = m[1]
        else:
            raise ValueError(b"No target URI in %s ??" % buf)
    else:
        mm = LMPAT.search(buf)
        OUT.write(URI)
        if mm:
            # Tab-separated second column only when the header is present.
            OUT.write(b'\t')
            OUT.write(mm[1])
        OUT.write(b'\n')


warc.warc(sys.argv[1], showmeLMH, [b'response'], parts=3)