changeset 95:86df63d251cf

Version which outputs more identifying information (segment, file type, file number); may not be needed
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 08 Sep 2023 09:29:25 +0100
parents 009e633eb804
children e1a05ead2b1c
files bin/lmh_warc.py
diffstat 1 files changed, 22 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/bin/lmh_warc.py	Thu Sep 07 18:03:55 2023 +0100
+++ b/bin/lmh_warc.py	Fri Sep 08 09:29:25 2023 +0100
@@ -1,6 +1,11 @@
 #!/usr/bin/env python3
+'''Extract identifying info + Last-Modified header value for all entries
+   that have one
 
-import re,warc,sys
+   Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''
+
+import re,warc,sys,glob,codecs
+
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
@@ -11,7 +16,7 @@
 OUT=open(sys.stdout.fileno(),'wb')
 
 def showmeLMH(wtype,buf,part):
-  global URI, DATE
+  global URI, DATE, SEGMENT, FILETYPE, FILENO
   if part==1:
     if (m:=TUPAT.search(buf)):
       URI=m[1]
@@ -28,8 +33,22 @@
       OUT.write(b'\t')
       OUT.write(DATE.translate(DTAB,DDEL))
       OUT.write(b'\t')
+      OUT.write(SEGMENT)
+      OUT.write(b'\t')
+      OUT.write(FILETYPE)
+      OUT.write(b'\t')
+      OUT.write(FILENO)
+      OUT.write(b'\t')
       OUT.write(mm[1])
     OUT.write(b'\n')
 
-warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3)
+(CCdate, segment, filetype, fileno) = sys.argv[1:]
+fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+  CCdate, segment, filetype, fileno)
 
+SEGMENT=codecs.encode(segment,'ascii')
+FILETYPE=codecs.encode(filetype,'ascii')
+FILENO=codecs.encode(fileno,'ascii')
+
+warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
+