Mercurial > hg > cc > cirrus_work
annotate bin/lmh_warc.py @ 46:44d3a4f4ea51
support on-board unzipping, reduce buffer size to 2MB
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 19:32:36 +0100 |
parents | 689a0e311cd2 |
children | 11a886a84a49 |
rev | line source |
---|---|
42
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 import re |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 def showmeLMH(wtype,buf,part): |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 global URI |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 if part==1: |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 if (m:=TUPAT.search(buf)): |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 URI=m[1] |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 else: |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 raise ValueError(b"No target URI in %s ??"%buf) |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 else: |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 m=LMPAT.search(buf) |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 OUT.write(URI) |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 if m: |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 OUT.write(b'\t') |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 OUT.write(m[1]) |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 OUT.write(b'\n') |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 warc(showmeLMH,[b'response'],parts=3) |
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 |