Mercurial > hg > cc > cirrus_work
changeset 301:4981c41628dd trim
works
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Tue, 13 May 2025 12:04:01 +0100 |
| parents | 1c11117bb01b |
| children | 13414b0dfefb |
| files | lib/python/cc/lmh/new_key.py |
| diffstat | 1 files changed, 15 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/lib/python/cc/lmh/new_key.py Thu May 08 19:00:26 2025 +0100
+++ b/lib/python/cc/lmh/new_key.py Tue May 13 12:04:01 2025 +0100
@@ -2,11 +2,23 @@
 '''Extract/construct a cut-down key for cdb'''
 import re, sys
 
-C_PAT = re.compile('[^ ]* ([^ ]*) .*{"url": "http([^"]*).*"filename": "[^"]*\.([0-9][0-9]?)/(warc|robotstxt|crawldiagnostics)/')
+C_PAT = re.compile('[^ ]* ([^ ]*) .*{"url": "(http[^"]*).*"filename": "[^"]*\.([0-9][0-9]?)/(warc|robotstxt|crawldiagnostics)/')
 
 for l in sys.stdin:
   if (m:=C_PAT.match(l)):
-    print(m[1],m[2],m[3],m[4])
+    (wdate, uri, seg, kind) = m.groups()
   else:
-    print('oops',l)
+    print('oops',file=sys.stderr)
+    print(l,file=sys.stderr)
     exit(1)
+
+  if kind == 'robotstxt':
+    wdate += seg
+  if wdate.startswith('201908'):
+    wdate = wdate[6:]
+  if uri.startswith('http'):
+    print(wdate,uri[4:],sep='')
+  else:
+    print("should't happen",uri,file=sys.stderr)
+    exit(2)
+
```
