Mercurial > hg > cc > cirrus_home
changeset 108:9e5b117dc461
using Popen to run igzip (also not great)
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 22 Apr 2021 19:06:55 +0000 |
parents | 007f35b9df9c |
children | 15abf4aab307 |
files | bin/ix.py |
diffstat | 1 file changed, 41 insertions(+), 14 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/ix.py Tue Apr 20 19:11:57 2021 +0000 +++ b/bin/ix.py Thu Apr 22 19:06:55 2021 +0000 @@ -8,7 +8,10 @@ Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' -import sys, argparse, regex, os, shutil +import sys, argparse, regex, os, shutil, io, gzip, time +#from isal import igzip +from subprocess import Popen, PIPE +#import asyncio HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') BINOUT=sys.stdout.buffer @@ -27,25 +30,46 @@ if not os.path.exists(rfn): if not os.path.exists(os.path.dirname(rfn)): os.makedirs(os.path.dirname(rfn)) - with open('/beegfs/common_crawl'+filename,'rb',0) as infile, \ - open(rfn,'wb',0) as outfile: - shutil.copyfileobj(infile,outfile,2048*1024) -# while True: -# l=infile.readinto(buf) -# if l is None: -# break -# print(l,file=sys.stderr) -# outfile.write(memoryview(buf)[:l]) - infile.close() - outfile.close() + with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \ + io.FileIO(rfn,'w') as outfile: + #shutil.copyfileobj(infile,outfile,128*1024*1024) + while True: + l=infile.readinto(buf) + if l==0: + break + outfile.write(memoryview(buf)[:l]) file=open(rfn,'rb',0) if whole: + # try external unzip using Popen file.seek(offset) bv=memoryview(buf)[:length] nb=file.readinto(bv) if nb!=length: print("losing",file.name,length,nb,file=sys.stderr) - BINOUT.write(bv) + if options.zipped: + BINOUT.write(bv) + else: + #gzip_chunk = io.BytesIO(bv) + uv=memoryview(buf)[length:] + #clear_bytes=io.BytesIO(uv) + p = Popen(["/lustre/home/dc007/hst/gentoo/usr/bin/igzip", + "-dc"], + stdin=PIPE, + stdout=None) + p.stdin.write(bv) + p.stdin.close() + res=p.wait() + if res!=0: + print('pipe failed',res,p.stderr.decode()) + exit(2) + file.close() + return + with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: + while True: + l=gzip_fin.readinto(uv) + if not l: + break + BINOUT.write(memoryview(uv)[:l]) file.close() def main(): @@ 
-75,6 +99,9 @@ parser.add_argument('-r','--root',nargs='?', help='File path root, create a copy there if necessary', default='/beegfs/common_crawl'), + parser.add_argument('-z','--zipped', + help="output raw gzipped record, ignored if any of -bhw supplied", + action='store_true') sg=parser.add_mutually_exclusive_group() sg.add_argument('-x','--index', help='take lines of triples from a cdx index file as input', @@ -98,7 +125,7 @@ if pa.offset is None or pa.filename is None: parser.error("length, offset and filename must all be supplied together") - buf=bytearray(2024*1024) + buf=bytearray(128*1024*1024) whole=not (pa.warc or pa.headers or pa.body) if pa.length is not None: