# HG changeset patch # User Henry S. Thompson # Date 1694161765 -3600 # Node ID 86df63d251cf7a0da0f10017346bed7627ebe413 # Parent 009e633eb804dc6ce4c3cfae3f5bfde8019a1674 version which outputs more identification, may not be needed diff -r 009e633eb804 -r 86df63d251cf bin/lmh_warc.py --- a/bin/lmh_warc.py Thu Sep 07 18:03:55 2023 +0100 +++ b/bin/lmh_warc.py Fri Sep 08 09:29:25 2023 +0100 @@ -1,6 +1,11 @@ #!/usr/bin/env python3 +'''Extract identifying info + LastModified header value for all entries + that have one -import re,warc,sys + Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno''' + +import re,warc,sys,glob,codecs + TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) @@ -11,7 +16,7 @@ OUT=open(sys.stdout.fileno(),'wb') def showmeLMH(wtype,buf,part): - global URI, DATE + global URI, DATE, SEGMENT, FILETYPE, FILENO if part==1: if (m:=TUPAT.search(buf)): URI=m[1] @@ -28,8 +33,22 @@ OUT.write(b'\t') OUT.write(DATE.translate(DTAB,DDEL)) OUT.write(b'\t') + OUT.write(SEGMENT) + OUT.write(b'\t') + OUT.write(FILETYPE) + OUT.write(b'\t') + OUT.write(FILENO) + OUT.write(b'\t') OUT.write(mm[1]) OUT.write(b'\n') -warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) +(CCdate, segment, filetype, fileno) = sys.argv[1:] +fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( + CCdate, segment, filetype, fileno) +SEGMENT=codecs.encode(segment,'ascii') +FILETYPE=codecs.encode(filetype,'ascii') +FILENO=codecs.encode(fileno,'ascii') + +warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3) +