changeset 165:26dfef7854f4

get in/out file management working right
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 31 Oct 2023 14:04:24 +0000
parents 4315a36b1672
children cac9fb70a4ca
files lib/python/cc/lmh/lmh.py
diffstat 1 files changed, 14 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/lmh.py	Tue Oct 31 14:03:02 2023 +0000
+++ b/lib/python/cc/lmh/lmh.py	Tue Oct 31 14:04:24 2023 +0000
@@ -2,11 +2,11 @@
 '''Extract identifying info + LastModified header value for all entries
    that have one
 
-   Usage: lmh.py CC-date segment filetype 3-digit-fileno [1]
+   Usage: lmh.py CC-date segment filetype 3-digit-fileno output-dir [1]
    Includes input identity columns in output if final arg is 1'''
 
 
-import re,warc,sys,glob,codecs
+import re,warc,sys,glob,codecs,os.path
 
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
@@ -43,21 +43,27 @@
       OUT.write(mm[1])
     OUT.write(b'\n')
 
-def main(CCdate, segment, filetype, fileno, extras=False):
+def main(CCdate, segment, filetype, fileno, outdir, extras=False):
   global SEGMENT, FILETYPE, FILENO, EXTRAS, OUT
 
-  OUT=open(sys.stdout.fileno(),'wb')
+  infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+    CCdate, segment, filetype, fileno)
+  infile_name=glob.glob(infile_pat)[0]
 
-  fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
-    CCdate, segment, filetype, fileno)
+  (_,_,_,_,_,_,_,ff)=infile_name.split('/')
+
+  outfile_name="%s/%s"%(outdir,os.path.splitext(ff)[0])
+
+  OUT=open(outfile_name,'wb')
 
   if EXTRAS:=bool(extras):
     SEGMENT=codecs.encode(segment,'ascii')
     FILETYPE=codecs.encode(filetype,'ascii')
     FILENO=codecs.encode(fileno,'ascii')
   
-
-  warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
+  warc.warc(infile_name,showmeLMH,[b'response'],parts=3)
+  
+  print(outfile_name)
 
 if __name__ == '__main__':
   sys.exit(main(*sys.argv[1:]))