Mercurial > hg > cc > cirrus_work
changeset 95:86df63d251cf
Version which outputs more identifying information;
may not be needed.
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 08 Sep 2023 09:29:25 +0100 |
parents | 009e633eb804 |
children | e1a05ead2b1c |
files | bin/lmh_warc.py |
diffstat | 1 files changed, 22 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/lmh_warc.py Thu Sep 07 18:03:55 2023 +0100
+++ b/bin/lmh_warc.py Fri Sep 08 09:29:25 2023 +0100
@@ -1,6 +1,11 @@
 #!/usr/bin/env python3
+'''Extract identifying info + LastModified header value for all entries
+ that have one
 
-import re,warc,sys
+ Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''
+
+import re,warc,sys,glob,codecs
+
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
@@ -11,7 +16,7 @@
 OUT=open(sys.stdout.fileno(),'wb')
 
 def showmeLMH(wtype,buf,part):
-  global URI, DATE
+  global URI, DATE, SEGMENT, FILETYPE, FILENO
   if part==1:
     if (m:=TUPAT.search(buf)):
       URI=m[1]
@@ -28,8 +33,22 @@
       OUT.write(b'\t')
       OUT.write(DATE.translate(DTAB,DDEL))
       OUT.write(b'\t')
+      OUT.write(SEGMENT)
+      OUT.write(b'\t')
+      OUT.write(FILETYPE)
+      OUT.write(b'\t')
+      OUT.write(FILENO)
+      OUT.write(b'\t')
       OUT.write(mm[1])
       OUT.write(b'\n')
 
 
-warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
+(CCdate, segment, filetype, fileno) = sys.argv[1:]
+fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+  CCdate, segment, filetype, fileno)
+SEGMENT=codecs.encode(segment,'ascii')
+FILETYPE=codecs.encode(filetype,'ascii')
+FILENO=codecs.encode(fileno,'ascii')
+
+warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
+
```