comparison bin/lmh_warc.py @ 79:120d90b47d74

include timestamp
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 21 Aug 2023 13:37:07 +0100
parents b14187ccfb46
children 86df63d251cf
comparison
equal deleted inserted replaced
78:fef49258d738 79:120d90b47d74
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 2
3 import re,warc,sys 3 import re,warc,sys
4 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) 4 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
5 DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
5 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) 6 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
7
8 DTAB=bytearray(range(256))
9 DDEL=b'TZ-:'
6 10
7 OUT=open(sys.stdout.fileno(),'wb') 11 OUT=open(sys.stdout.fileno(),'wb')
8 12
9 def showmeLMH(wtype,buf,part): 13 def showmeLMH(wtype,buf,part):
10 global URI 14 global URI, DATE
11 if part==1: 15 if part==1:
12 if (m:=TUPAT.search(buf)): 16 if (m:=TUPAT.search(buf)):
13 URI=m[1] 17 URI=m[1]
14 else: 18 else:
15 raise ValueError(b"No target URI in %s ??"%buf) 19 raise ValueError(b"No target URI in %s ??"%buf)
20 if (md:=DPAT.search(buf)):
21 DATE=md[1]
22 else:
23 raise ValueError(b"No date in %s ??"%buf)
16 else: 24 else:
17 mm=LMPAT.search(buf) 25 mm=LMPAT.search(buf)
18 OUT.write(URI) 26 OUT.write(URI)
19 if mm: 27 if mm:
28 OUT.write(b'\t')
29 OUT.write(DATE.translate(DTAB,DDEL))
20 OUT.write(b'\t') 30 OUT.write(b'\t')
21 OUT.write(mm[1]) 31 OUT.write(mm[1])
22 OUT.write(b'\n') 32 OUT.write(b'\n')
23 33
24 warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) 34 warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)