Mercurial > hg > cc > cirrus_work
comparison bin/lmh_warc.py @ 55:11a886a84a49
finds multiples
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 10 Jul 2023 18:17:35 +0100 |
parents | 689a0e311cd2 |
children | b14187ccfb46 |
comparison
equal
deleted
inserted
replaced
54:9c63039a9b6d | 55:11a886a84a49 |
---|---|
1 import re | 1 import re,swarc,sys |
2 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) | 2 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) |
3 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) | 3 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) |
| | 4 |
| | 5 OUT=open(sys.stdout.fileno(),'wb') |
4 | 6 |
5 def showmeLMH(wtype,buf,part): | 7 def showmeLMH(wtype,buf,part): |
6 global URI | 8 global URI |
7 if part==1: | 9 if part==1: |
8 if (m:=TUPAT.search(buf)): | 10 if (m:=TUPAT.search(buf)): |
9 URI=m[1] | 11 URI=m[1] |
10 else: | 12 else: |
11 raise ValueError(b"No target URI in %s ??"%buf) | 13 raise ValueError(b"No target URI in %s ??"%buf) |
12 else: | 14 else: |
13 m=LMPAT.search(buf) | 15 mm=LMPAT.findall(buf) |
14 OUT.write(URI) | 16 OUT.write(URI) |
15 if m: | 17 if mm: |
16 OUT.write(b'\t') | 18 for m in mm: |
17 OUT.write(m[1]) | 19 OUT.write(b'\t') |
| | 20 OUT.write(m) |
18 OUT.write(b'\n') | 21 OUT.write(b'\n') |
19 | 22 |
20 warc(showmeLMH,[b'response'],parts=3) | 23 swarc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) |
21 | 24 |