Mercurial > hg > cc > cirrus_work
comparison bin/lmh_warc.py @ 64:b14187ccfb46
revert to just showing the first Last-Modified (LM) header
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 19 Jul 2023 13:19:42 +0100 |
parents | 11a886a84a49 |
children | 120d90b47d74 |
comparison
equal
deleted
inserted
replaced
63:9837840f3328 | 64:b14187ccfb46 |
---|---|
1 import re,swarc,sys | 1 #!/usr/bin/env python3 |
2 | |
3 import re,warc,sys | |
2 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) | 4 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) |
3 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) | 5 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) |
4 | 6 |
5 OUT=open(sys.stdout.fileno(),'wb') | 7 OUT=open(sys.stdout.fileno(),'wb') |
6 | 8 |
10 if (m:=TUPAT.search(buf)): | 12 if (m:=TUPAT.search(buf)): |
11 URI=m[1] | 13 URI=m[1] |
12 else: | 14 else: |
13 raise ValueError(b"No target URI in %s ??"%buf) | 15 raise ValueError(b"No target URI in %s ??"%buf) |
14 else: | 16 else: |
15 mm=LMPAT.findall(buf) | 17 mm=LMPAT.search(buf) |
16 OUT.write(URI) | 18 OUT.write(URI) |
17 if mm: | 19 if mm: |
18 for m in mm: | 20 OUT.write(b'\t') |
19 OUT.write(b'\t') | 21 OUT.write(mm[1]) |
20 OUT.write(m) | |
21 OUT.write(b'\n') | 22 OUT.write(b'\n') |
22 | 23 |
23 swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) | 24 warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) |
24 | 25 |