Mercurial > hg > cc > cirrus_work
annotate bin/lmh_warc.py @ 109:52c6a9b0fc8c
loosen must-match criterion in the both-messy case
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 19 Sep 2023 19:29:41 +0100 |
parents | 86df63d251cf |
children |
rev | line source |
---|---|
64
b14187ccfb46
revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
1 #!/usr/bin/env python3 |
95
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
2 '''Extract identifying info + Last-Modified header value for all entries |
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
3 that have one |
64
b14187ccfb46
revert to just showing first LM
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
4 |
95
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
5 Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno''' |
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
6 |
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
7 import re,warc,sys,glob,codecs |
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
8 |
42
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Header-line patterns, applied to raw bytes.  Each captures the header
# value up to (not including) the carriage return that ends the line.
TUPAT = re.compile(b'^WARC-Target-URI: (.*?)\r', re.MULTILINE)
DPAT = re.compile(b'^WARC-Date: (.*?)\r', re.MULTILINE)
LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE)

# Arguments for bytes.translate: DTAB is the identity mapping, so the only
# effect is deleting the bytes listed in DDEL ('T', 'Z', '-' and ':').
DTAB = bytearray(range(256))
DDEL = b'TZ-:'
15 | |
# Binary-mode handle on the process's standard output, so that byte strings
# taken from the WARC data can be written without decoding them.
OUT = open(sys.stdout.fileno(), mode='wb')
17 | |
42
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def showmeLMH(wtype,buf,part):
    '''Callback handed to warc.warc: emit one output line per record.

    When part is 1, buf is searched for the WARC-Target-URI and WARC-Date
    header values, which are remembered in the globals URI and DATE.
    For any other part, the remembered URI is written to OUT; if buf
    contains a Last-Modified header, the line continues with the
    tab-separated DATE (with 'T', 'Z', '-' and ':' removed), SEGMENT,
    FILETYPE, FILENO and the first Last-Modified value.  The line is then
    ended with a newline.

    wtype is accepted but not used here.
    NOTE(review): presumably part 1 is the WARC header section and the
    other part the HTTP header section -- confirm against warc.warc.

    Raises ValueError if a part-1 buffer has no target URI or no date.
    '''
    global URI, DATE
    if part == 1:
        uri_match = TUPAT.search(buf)
        if uri_match is None:
            raise ValueError(b"No target URI in %s ??"%buf)
        URI = uri_match[1]
        date_match = DPAT.search(buf)
        if date_match is None:
            raise ValueError(b"No date in %s ??"%buf)
        DATE = date_match[1]
        return

    # Only the first Last-Modified header in the buffer is reported
    lm_match = LMPAT.search(buf)
    OUT.write(URI)
    if lm_match is not None:
        extra_fields = (DATE.translate(DTAB, DDEL),
                        SEGMENT,
                        FILETYPE,
                        FILENO,
                        lm_match[1])
        for field in extra_fields:
            OUT.write(b'\t')
            OUT.write(field)
    OUT.write(b'\n')
689a0e311cd2
make warc.py a library, separate out testing
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
44 |
95
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
# Command-line driver: locate the requested WARC file and run showmeLMH
# over its response records.
if len(sys.argv) != 5:
    # Fail with the usage line rather than an opaque tuple-unpacking error
    raise SystemExit(
        'Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno')

(CCdate, segment, filetype, fileno) = sys.argv[1:]
fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
    CCdate, segment, filetype, fileno)

# showmeLMH writes bytes, so keep byte-string copies of the identifiers
SEGMENT=codecs.encode(segment,'ascii')
FILETYPE=codecs.encode(filetype,'ascii')
FILENO=codecs.encode(fileno,'ascii')

matches=glob.glob(fn)
if not matches:
    # Report the pattern instead of dying with a bare IndexError
    raise SystemExit('lmh_warc.py: no file matches %s'%fn)

# As before, only the first match is processed
warc.warc(matches[0],showmeLMH,[b'response'],parts=3)

# Push any buffered output through to stdout before the interpreter exits
OUT.flush()
86df63d251cf
version which outputs more identification,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
54 |