view bin/lmh_warc.py @ 57:61b0a1582af8

works with all types, part=1
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 12 Jul 2023 18:48:27 +0100
parents 11a886a84a49
children b14187ccfb46
line wrap: on
line source

import re,swarc,sys
# Byte patterns for single header lines.  Each captures the field value up to
# the terminating CR, so only CR-terminated lines can match.
TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
# HTTP field names are case-insensitive, so a server may legitimately send
# b'last-modified:' or b'LAST-MODIFIED:'; accept any casing of the name.
LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE|re.IGNORECASE)

# Binary-mode file object on stdout's file descriptor, so that the bytes
# captured from the input can be written without decoding them first.
OUT=open(sys.stdout.fileno(),'wb')

def showmeLMH(wtype,buf,part):
  """Callback for swarc.warc: write one line per record to OUT.

  A call with part==1 only remembers the target URI found in buf (in the
  module-level URI).  A call with any other part writes that URI followed
  by every value LMPAT finds in buf, tab-separated, then a newline.

  wtype -- not used by this function
  buf   -- bytes to search
  part  -- selects which of the two behaviours above applies (presumably 1
           means buf is the WARC header block -- confirm against swarc)

  Raises ValueError if a part-1 buffer contains no WARC-Target-URI line.
  """
  global URI
  if part!=1:
    stamps=LMPAT.findall(buf)
    # Relies on an earlier part-1 call having set URI
    OUT.write(b'\t'.join([URI,*stamps])+b'\n')
    return
  found=TUPAT.search(buf)
  if found is None:
    raise ValueError(b"No target URI in %s ??"%buf)
  URI=found[1]

# Run showmeLMH over the WARC file named by the first command-line argument.
# [b'response'] and parts=3 are passed straight through to swarc.warc --
# presumably they restrict processing to response records and select parts
# 1 and 2 as a bitmask; confirm against swarc.
swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)