Mercurial > hg > cc > cirrus_work
changeset 165:26dfef7854f4
get in/out file management working right
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 31 Oct 2023 14:04:24 +0000 |
parents | 4315a36b1672 |
children | cac9fb70a4ca |
files | lib/python/cc/lmh/lmh.py |
diffstat | 1 files changed, 14 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/lib/python/cc/lmh/lmh.py	Tue Oct 31 14:03:02 2023 +0000
+++ b/lib/python/cc/lmh/lmh.py	Tue Oct 31 14:04:24 2023 +0000
@@ -2,11 +2,11 @@
 '''Extract identifying info + LastModified header value for all entries that have one
 
-   Usage: lmh.py CC-date segment filetype 3-digit-fileno [1]
+   Usage: lmh.py CC-date segment filetype 3-digit-fileno output-dir [1]
 
    Includes input identity columns in output if final arg is 1'''
 
-import re,warc,sys,glob,codecs
+import re,warc,sys,glob,codecs,os.path
 
 TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
 DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
@@ -43,21 +43,27 @@
     OUT.write(mm[1])
   OUT.write(b'\n')
 
-def main(CCdate, segment, filetype, fileno, extras=False):
+def main(CCdate, segment, filetype, fileno, outdir, extras=False):
   global SEGMENT, FILETYPE, FILENO, EXTRAS, OUT
-  OUT=open(sys.stdout.fileno(),'wb')
+  infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
+    CCdate, segment, filetype, fileno)
+  infile_name=glob.glob(infile_pat)[0]
 
-  fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
-    CCdate, segment, filetype, fileno)
+  (_,_,_,_,_,_,_,ff)=infile_name.split('/')
+
+  outfile_name="%s/%s"%(outdir,os.path.splitext(ff)[0])
+
+  OUT=open(outfile_name,'wb')
   if EXTRAS:=bool(extras):
     SEGMENT=codecs.encode(segment,'ascii')
     FILETYPE=codecs.encode(filetype,'ascii')
     FILENO=codecs.encode(fileno,'ascii')
-
-  warc.warc(glob.glob(fn)[0],showmeLMH,[b'response'],parts=3)
+  warc.warc(infile_name,showmeLMH,[b'response'],parts=3)
+
+  print(outfile_name)
 
 if __name__ == '__main__':
   sys.exit(main(*sys.argv[1:]))
```