Mercurial > hg > cc > cirrus_work
view bin/lmh_warc.py @ 109:52c6a9b0fc8c
loosen must-match criterion in the both-messy case
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 19 Sep 2023 19:29:41 +0100 |
parents | 86df63d251cf |
children |
line wrap: on
line source
#!/usr/bin/env python3
'''Extract identifying info + Last-Modified header value for response records.

Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno

One line is written to stdout per callback on a non-first part:
  * the target URI alone when no Last-Modified header was found, or
  * URI, date (with 'T', 'Z', '-' and ':' removed), segment, filetype,
    fileno and the Last-Modified value, separated by tabs.
'''

import codecs
import glob
import re
import sys

# Not part of the standard library; must be importable where the script runs.
import warc

# Header patterns.  They operate on bytes and expect CRLF line endings.
TUPAT = re.compile(b'^WARC-Target-URI: (.*?)\r', re.MULTILINE)
DPAT = re.compile(b'^WARC-Date: (.*?)\r', re.MULTILINE)
LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE)

# Identity translation table plus the set of bytes to delete: together they
# make bytes.translate() strip the punctuation from the date value.
DTAB = bytearray(range(256))
DDEL = b'TZ-:'

# Binary handle on the stdout file descriptor so bytes are written undecoded.
OUT = open(sys.stdout.fileno(), 'wb')

USAGE = 'Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'


def showmeLMH(wtype, buf, part):
    '''Callback passed to warc.warc(); remembers URI/date, then emits a line.

    wtype -- supplied by the caller; not used here.
    buf   -- bytes searched with the header patterns above.
    part  -- 1: pull the target URI and date out of buf and remember them;
             anything else: look for Last-Modified in buf and write a line.

    Raises ValueError when part is 1 and buf has no target URI or no date.

    NOTE(review): presumably part 1 is the WARC header block and the other
    part the HTTP header block (the driver passes parts=3) -- confirm
    against the warc module.
    '''
    # URI and DATE carry state from the part-1 call over to the next call.
    global URI, DATE
    if part == 1:
        uri_match = TUPAT.search(buf)
        if uri_match is None:
            raise ValueError(b"No target URI in %s ??" % buf)
        URI = uri_match[1]
        date_match = DPAT.search(buf)
        if date_match is None:
            raise ValueError(b"No date in %s ??" % buf)
        DATE = date_match[1]
        return

    lm_match = LMPAT.search(buf)
    # The URI is always written; the other columns only when the header exists.
    OUT.write(URI)
    if lm_match:
        columns = (DATE.translate(DTAB, DDEL), SEGMENT, FILETYPE, FILENO,
                   lm_match[1])
        for column in columns:
            OUT.write(b'\t')
            OUT.write(column)
    OUT.write(b'\n')


# Fail with a usage message rather than an opaque tuple-unpacking error.
if len(sys.argv) != 5:
    sys.exit(USAGE)

(CCdate, segment, filetype, fileno) = sys.argv[1:]
fn = '/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz' % (
    CCdate, segment, filetype, fileno)

# The callback writes bytes, so keep byte versions of the identifying fields.
SEGMENT = codecs.encode(segment, 'ascii')
FILETYPE = codecs.encode(filetype, 'ascii')
FILENO = codecs.encode(fileno, 'ascii')

# Only the first match of the pattern is processed, as before; report a
# missing file explicitly instead of failing with IndexError.
matches = glob.glob(fn)
if not matches:
    sys.exit('No file matches %s' % fn)

warc.warc(matches[0], showmeLMH, [b'response'], parts=3)