changeset 108:9e5b117dc461

using Popen to run igzip (also not great)
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 22 Apr 2021 19:06:55 +0000
parents 007f35b9df9c
children 15abf4aab307
files bin/ix.py
diffstat 1 files changed, 41 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Tue Apr 20 19:11:57 2021 +0000
+++ b/bin/ix.py	Thu Apr 22 19:06:55 2021 +0000
@@ -8,7 +8,10 @@
 
 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
 
-import sys, argparse, regex, os, shutil
+import sys, argparse, regex, os, shutil, io, gzip, time
+#from isal import igzip
+from subprocess import Popen, PIPE
+#import asyncio
 
 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
 BINOUT=sys.stdout.buffer
@@ -27,25 +30,46 @@
     if not os.path.exists(rfn):
       if not os.path.exists(os.path.dirname(rfn)):
         os.makedirs(os.path.dirname(rfn))
-      with open('/beegfs/common_crawl'+filename,'rb',0) as infile, \
-              open(rfn,'wb',0) as outfile:
-        shutil.copyfileobj(infile,outfile,2048*1024)
-#         while True:
-#           l=infile.readinto(buf)
-#           if l is None:
-#             break
-#           print(l,file=sys.stderr)
-#           outfile.write(memoryview(buf)[:l])
-        infile.close()
-        outfile.close()
+      with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \
+              io.FileIO(rfn,'w') as outfile:
+        #shutil.copyfileobj(infile,outfile,128*1024*1024)
+        while True:
+          l=infile.readinto(buf)
+          if l==0:
+            break
+          outfile.write(memoryview(buf)[:l])
   file=open(rfn,'rb',0)
   if whole:
+    # try external unzip using Popen
     file.seek(offset)
     bv=memoryview(buf)[:length]
     nb=file.readinto(bv)
     if nb!=length:
       print("losing",file.name,length,nb,file=sys.stderr)
-    BINOUT.write(bv)
+    if options.zipped:
+      BINOUT.write(bv)
+    else:
+      #gzip_chunk = io.BytesIO(bv)
+      uv=memoryview(buf)[length:]
+      #clear_bytes=io.BytesIO(uv)
+      p = Popen(["/lustre/home/dc007/hst/gentoo/usr/bin/igzip",
+                                  "-dc"],
+                 stdin=PIPE,
+                 stdout=None)
+      p.stdin.write(bv)
+      p.stdin.close()
+      res=p.wait()
+      if res!=0:
+        print('pipe failed',res,p.stderr.decode())
+        exit(2)
+      file.close()
+      return
+      with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
+        while True:
+          l=gzip_fin.readinto(uv)
+          if not l:
+            break
+          BINOUT.write(memoryview(uv)[:l])
   file.close()
 
 def main():
@@ -75,6 +99,9 @@
   parser.add_argument('-r','--root',nargs='?',
                   help='File path root, create a copy there if necessary',
                   default='/beegfs/common_crawl'),
+  parser.add_argument('-z','--zipped',
+                      help="output raw gzipped record, ignored if any of -bhw supplied",
+                      action='store_true')
   sg=parser.add_mutually_exclusive_group()
   sg.add_argument('-x','--index',
                       help='take lines of triples from a cdx index file as input',
@@ -98,7 +125,7 @@
     if pa.offset is None or pa.filename is None:
       parser.error("length, offset and filename must all be supplied together")
  
-  buf=bytearray(2024*1024)
+  buf=bytearray(128*1024*1024)
 
   whole=not (pa.warc or pa.headers or pa.body)
   if pa.length is not None: