Mercurial > hg > cc > cirrus_work
annotate bin/lmh_warc.py @ 79:120d90b47d74
include timestamp
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 21 Aug 2023 13:37:07 +0100 |
parents | b14187ccfb46 |
children | 86df63d251cf |
rev | line source |
---|---|
64
b14187ccfb46
revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
1 #!/usr/bin/env python3 |
b14187ccfb46
revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
2 |
b14187ccfb46
revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
3 import re,warc,sys |
42
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Patterns that pull a single header value out of a raw (bytes) header block.
# Each one anchors at the start of a line (MULTILINE) and captures everything
# after "Name: " up to, but not including, the terminating carriage return.
#
# IGNORECASE is set because header field names are case-insensitive: a
# server that sends "last-modified: ..." must be recognised just like one
# that sends "Last-Modified: ...", otherwise such responses are silently
# reported as having no Last-Modified value.  The captured value itself is
# returned unchanged.
TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE|re.IGNORECASE)
DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE|re.IGNORECASE)
LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE|re.IGNORECASE)
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 |
# Identity translation table (byte i maps to byte i) together with the set of
# bytes to delete.  Passed as DATE.translate(DTAB,DDEL) in showmeLMH, this
# leaves every byte unchanged except that 'T', 'Z', '-' and ':' are removed,
# e.g. b'2023-08-21T12:37:07Z' becomes b'20230821123707'.
DTAB=bytearray(range(256))
DDEL=b'TZ-:'

# Binary-mode file object on the stdout file descriptor, so that the bytes
# captured by the header regexes can be written without decoding.
OUT=open(sys.stdout.fileno(),'wb')
12 | |
42
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def showmeLMH(wtype,buf,part):
  """Callback handed to warc.warc: emit one output line per record.

  When part is 1, buf is searched for the WARC-Target-URI and WARC-Date
  headers and their values are stored in the module globals URI and DATE.
  For any other part, buf is searched for a Last-Modified header and one
  line is written to OUT: the stored URI, followed (only if Last-Modified
  was found) by a tab, the stored date with the bytes in DDEL removed, a
  tab and the Last-Modified value, then a newline.

  NOTE(review): presumably part 1 is the WARC header block and the other
  part is the HTTP header block of the same record -- confirm against
  warc.py.

  wtype -- not used here
  buf   -- bytes to search
  part  -- selects which of the two behaviours above applies

  Raises ValueError if part is 1 and either WARC header is missing.
  """
  global URI, DATE
  if part!=1:
    lm_found=LMPAT.search(buf)
    # The URI is always written, even when there is no Last-Modified
    OUT.write(URI)
    if lm_found is not None:
      OUT.write(b'\t')
      OUT.write(DATE.translate(DTAB,DDEL))
      OUT.write(b'\t')
      OUT.write(lm_found[1])
    OUT.write(b'\n')
    return
  # part 1: remember URI and date for the later call on this record
  uri_found=TUPAT.search(buf)
  if uri_found is None:
    raise ValueError(b"No target URI in %s ??"%buf)
  URI=uri_found[1]
  date_found=DPAT.search(buf)
  if date_found is None:
    raise ValueError(b"No date in %s ??"%buf)
  DATE=date_found[1]
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
33 |
64
b14187ccfb46
revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
# Script entry point: run the WARC reader over the file named by the first
# command-line argument, passing showmeLMH as the callback, a one-element
# list of record types (b'response') and parts=3.
# NOTE(review): presumably parts=3 asks for the WARC headers and the HTTP
# headers but not the body -- confirm against warc.py.
warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
42
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 |