annotate bin/lmh_warc.py @ 79:120d90b47d74

include timestamp
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 21 Aug 2023 13:37:07 +0100
parents b14187ccfb46
children 86df63d251cf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
64
b14187ccfb46 revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 55
diff changeset
1 #!/usr/bin/env python3
b14187ccfb46 revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 55
diff changeset
2
b14187ccfb46 revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 55
diff changeset
3 import re,warc,sys
42
689a0e311cd2 make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Header-line patterns applied to raw record bytes.  Each one anchors at
# the start of a line (MULTILINE) and captures the field value lazily up
# to the carriage return that ends the line.
# IGNORECASE: header field names are case-insensitive, so e.g. a
# 'last-modified:' line must be found just like 'Last-Modified:'.
TUPAT = re.compile(b'^WARC-Target-URI: (.*?)\r', re.MULTILINE | re.IGNORECASE)
DPAT = re.compile(b'^WARC-Date: (.*?)\r', re.MULTILINE | re.IGNORECASE)
LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE | re.IGNORECASE)

# Arguments for bytes.translate(): DTAB maps every byte value to itself,
# so the only effect of translate(DTAB, DDEL) is to delete the bytes
# listed in DDEL ('T', 'Z', '-' and ':').
DTAB = bytearray(range(256))
DDEL = b'TZ-:'
120d90b47d74 include timestamp
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 64
diff changeset
10
55
11a886a84a49 finds multiples
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
# Binary-mode writer opened on the file descriptor of sys.stdout, so that
# the bytes captured from records can be written without decoding.
OUT = open(sys.stdout.fileno(), 'wb')
11a886a84a49 finds multiples
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
12
42
689a0e311cd2 make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def showmeLMH(wtype, buf, part):
    """Record-part callback: remember URI and date, then emit one line.

    When ``part`` is 1, ``buf`` is searched for the ``WARC-Target-URI``
    and ``WARC-Date`` header lines and their values are stored in the
    module globals ``URI`` and ``DATE``.

    For any other ``part``, ``buf`` is searched for a ``Last-Modified``
    header line and one line is written to ``OUT``: the stored URI,
    followed -- only if a Last-Modified value was found -- by a tab, the
    stored date with the ``DDEL`` bytes removed, a tab and the
    Last-Modified value; the line always ends with a newline.

    ``wtype`` is accepted but not used here.

    Raises:
        ValueError: if ``part`` is 1 and ``buf`` has no target URI or no
            date line.
    """
    global URI, DATE

    if part != 1:
        # Uses the URI/DATE stored by an earlier part-1 call.
        # NOTE(review): presumably the caller always delivers part 1 of a
        # record before its other parts -- confirm against warc.warc.
        lm_match = LMPAT.search(buf)
        OUT.write(URI)
        if lm_match is not None:
            OUT.write(b'\t')
            OUT.write(DATE.translate(DTAB, DDEL))
            OUT.write(b'\t')
            OUT.write(lm_match[1])
        OUT.write(b'\n')
        return

    uri_match = TUPAT.search(buf)
    if uri_match is None:
        raise ValueError(b"No target URI in %s ??" % buf)
    URI = uri_match[1]

    date_match = DPAT.search(buf)
    if date_match is None:
        raise ValueError(b"No date in %s ??" % buf)
    DATE = date_match[1]
689a0e311cd2 make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33
64
b14187ccfb46 revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 55
diff changeset
# Process the file named by the first command-line argument, handing the
# parts of each b'response' record to showmeLMH.
# NOTE(review): parts=3 presumably selects which record parts are passed
# to the callback -- confirm against the warc module.
warc.warc(sys.argv[1], showmeLMH, [b'response'], parts=3)
42
689a0e311cd2 make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35