Mercurial > hg > cc > cirrus_work
comparison bin/lmh_warc.py @ 79:120d90b47d74
include timestamp
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 21 Aug 2023 13:37:07 +0100 |
parents | b14187ccfb46 |
children | 86df63d251cf |
comparison
equal
deleted
inserted
replaced
78:fef49258d738 | 79:120d90b47d74 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 | 2 |
3 import re,warc,sys | 3 import re,warc,sys |
4 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) | 4 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) |
| | 5 DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) |
5 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) | 6 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) |
| | 7 |
| | 8 DTAB=bytearray(range(256)) |
| | 9 DDEL=b'TZ-:' |
6 | 10 |
7 OUT=open(sys.stdout.fileno(),'wb') | 11 OUT=open(sys.stdout.fileno(),'wb') |
8 | 12 |
9 def showmeLMH(wtype,buf,part): | 13 def showmeLMH(wtype,buf,part): |
10 global URI | 14 global URI, DATE |
11 if part==1: | 15 if part==1: |
12 if (m:=TUPAT.search(buf)): | 16 if (m:=TUPAT.search(buf)): |
13 URI=m[1] | 17 URI=m[1] |
14 else: | 18 else: |
15 raise ValueError(b"No target URI in %s ??"%buf) | 19 raise ValueError(b"No target URI in %s ??"%buf) |
| | 20 if (md:=DPAT.search(buf)): |
| | 21 DATE=md[1] |
| | 22 else: |
| | 23 raise ValueError(b"No date in %s ??"%buf) |
16 else: | 24 else: |
17 mm=LMPAT.search(buf) | 25 mm=LMPAT.search(buf) |
18 OUT.write(URI) | 26 OUT.write(URI) |
19 if mm: | 27 if mm: |
| | 28 OUT.write(b'\t') |
| | 29 OUT.write(DATE.translate(DTAB,DDEL)) |
20 OUT.write(b'\t') | 30 OUT.write(b'\t') |
21 OUT.write(mm[1]) | 31 OUT.write(mm[1]) |
22 OUT.write(b'\n') | 32 OUT.write(b'\n') |
23 | 33 |
24 warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) | 34 warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) |