comparison bin/lmh_warc.py @ 64:b14187ccfb46

Revert to showing just the first Last-Modified (LM) header
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 19 Jul 2023 13:19:42 +0100
parents 11a886a84a49
children 120d90b47d74
comparison
equal deleted inserted replaced
63:9837840f3328 64:b14187ccfb46
1 import re,swarc,sys 1 #!/usr/bin/env python3
2
3 import re,warc,sys
2 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) 4 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
3 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) 5 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
4 6
5 OUT=open(sys.stdout.fileno(),'wb') 7 OUT=open(sys.stdout.fileno(),'wb')
6 8
10 if (m:=TUPAT.search(buf)): 12 if (m:=TUPAT.search(buf)):
11 URI=m[1] 13 URI=m[1]
12 else: 14 else:
13 raise ValueError(b"No target URI in %s ??"%buf) 15 raise ValueError(b"No target URI in %s ??"%buf)
14 else: 16 else:
15 mm=LMPAT.findall(buf) 17 mm=LMPAT.search(buf)
16 OUT.write(URI) 18 OUT.write(URI)
17 if mm: 19 if mm:
18 for m in mm: 20 OUT.write(b'\t')
19 OUT.write(b'\t') 21 OUT.write(mm[1])
20 OUT.write(m)
21 OUT.write(b'\n') 22 OUT.write(b'\n')
22 23
23 swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) 24 warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
24 25