comparison bin/lmh_warc.py @ 55:11a886a84a49

finds multiples
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 10 Jul 2023 18:17:35 +0100
parents 689a0e311cd2
children b14187ccfb46
comparison
equal deleted inserted replaced
54:9c63039a9b6d 55:11a886a84a49
1 import re 1 import re,swarc,sys
2 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) 2 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
3 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) 3 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
4
5 OUT=open(sys.stdout.fileno(),'wb')
4 6
5 def showmeLMH(wtype,buf,part): 7 def showmeLMH(wtype,buf,part):
6 global URI 8 global URI
7 if part==1: 9 if part==1:
8 if (m:=TUPAT.search(buf)): 10 if (m:=TUPAT.search(buf)):
9 URI=m[1] 11 URI=m[1]
10 else: 12 else:
11 raise ValueError(b"No target URI in %s ??"%buf) 13 raise ValueError(b"No target URI in %s ??"%buf)
12 else: 14 else:
13 m=LMPAT.search(buf) 15 mm=LMPAT.findall(buf)
14 OUT.write(URI) 16 OUT.write(URI)
15 if m: 17 if mm:
16 OUT.write(b'\t') 18 for m in mm:
17 OUT.write(m[1]) 19 OUT.write(b'\t')
20 OUT.write(m)
18 OUT.write(b'\n') 21 OUT.write(b'\n')
19 22
20 warc(showmeLMH,[b'response'],parts=3) 23 swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
21 24