# HG changeset patch # User Henry S. Thompson # Date 1698761064 0 # Node ID 26dfef7854f4a947b7043257932dfae777ac8c3d # Parent 4315a36b1672a5444e1253829b8d5820f4b02b71 get in/out file management working right diff -r 4315a36b1672 -r 26dfef7854f4 lib/python/cc/lmh/lmh.py --- a/lib/python/cc/lmh/lmh.py Tue Oct 31 14:03:02 2023 +0000 +++ b/lib/python/cc/lmh/lmh.py Tue Oct 31 14:04:24 2023 +0000 @@ -2,11 +2,11 @@ '''Extract identifying info + LastModified header value for all entries that have one - Usage: lmh.py CC-date segment filetype 3-digit-fileno [1] + Usage: lmh.py CC-date segment filetype 3-digit-fileno output-dir [1] Includes input identity columns in output if final arg is 1''' -import re,warc,sys,glob,codecs +import re,warc,sys,glob,codecs,os.path TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) @@ -43,21 +43,27 @@ OUT.write(mm[1]) OUT.write(b'\n') -def main(CCdate, segment, filetype, fileno, extras=False): +def main(CCdate, segment, filetype, fileno, outdir, extras=False): global SEGMENT, FILETYPE, FILENO, EXTRAS, OUT - OUT=open(sys.stdout.fileno(),'wb') + infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( + CCdate, segment, filetype, fileno) + infile_name=glob.glob(infile_pat)[0] - fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( - CCdate, segment, filetype, fileno) + (_,_,_,_,_,_,_,ff)=infile_name.split('/') + + outfile_name="%s/%s"%(outdir,os.path.splitext(ff)[0]) + + OUT=open(outfile_name,'wb') if EXTRAS:=bool(extras): SEGMENT=codecs.encode(segment,'ascii') FILETYPE=codecs.encode(filetype,'ascii') FILENO=codecs.encode(fileno,'ascii') - - warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3) + warc.warc(infile_name,showmeLMH,[b'response'],parts=3) + + print(outfile_name) if __name__ == '__main__': sys.exit(main(*sys.argv[1:]))