diff bin/ix.py @ 107:007f35b9df9c

added support for copying to/using /dev/shm or /tmp
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 20 Apr 2021 19:11:57 +0000
parents 815b33c3254a
children 9e5b117dc461
line wrap: on
line diff
--- a/bin/ix.py	Tue Apr 20 12:26:09 2021 +0000
+++ b/bin/ix.py	Tue Apr 20 19:11:57 2021 +0000
@@ -8,11 +8,11 @@
 
 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
 
-import sys, argparse, regex
+import sys, argparse, regex, os, shutil
 
 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
-FPAT="/beegfs/common_crawl/%s/%s/orig/warc/%s"
 BINOUT=sys.stdout.buffer
+FPAT="/%s/%s/orig/warc/%s"
 
 class HackFormat(argparse.RawDescriptionHelpFormatter):
   def format_help(self):
@@ -21,8 +21,24 @@
     return HACK_USAGE.sub('\n             [ ( -x | length offset filename ) ]',
                           FOO)
 
-def process(options,buf,filename,offset,length,whole):
-  file=open(filename,'rb',0)
+def process(options,buf,root,filename,offset,length,whole):
+  rfn=root+filename
+  if root!="/beegfs/common_crawl" and not os.path.exists(rfn):
+    # Not cached under root yet: copy it there from the master store
+    os.makedirs(os.path.dirname(rfn),exist_ok=True)
+    # Copy to a temporary name and rename into place, so an interrupted
+    #  or concurrent copy never leaves a truncated file at rfn
+    tfn="%s.%d.tmp"%(rfn,os.getpid())
+    try:
+      with open('/beegfs/common_crawl'+filename,'rb',0) as infile, \
+              open(tfn,'wb',0) as outfile:
+        shutil.copyfileobj(infile,outfile,2048*1024)
+      os.replace(tfn,rfn)
+    except BaseException:
+      if os.path.exists(tfn):
+        os.remove(tfn)
+      raise
+  file=open(rfn,'rb',0)
   if whole:
     file.seek(offset)
     bv=memoryview(buf)[:length]
@@ -56,6 +72,9 @@
   parser.add_argument('-b','--body',help='output HTTP body',
                       action='store_true')
   parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
+  parser.add_argument('-r','--root',nargs='?',
+                  help='File path root, create a copy there if necessary',
+                  default='/beegfs/common_crawl',const='/beegfs/common_crawl')
   sg=parser.add_mutually_exclusive_group()
   sg.add_argument('-x','--index',
                       help='take lines of triples from a cdx index file as input',
@@ -83,7 +102,8 @@
 
   whole=not (pa.warc or pa.headers or pa.body)
   if pa.length is not None:
-    process(pa,buf,FPAT%tuple(pa.filename.split('/')),pa.offset,pa.length,whole)
+    process(pa,buf,pa.root,FPAT%tuple(pa.filename.split('/')),
+            pa.offset,pa.length,whole)
     exit(0)
   if pa.index:
     CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
@@ -93,7 +113,8 @@
         print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
         exit(2)
       f=FPAT%(m[3:6])
-      process(pa,buf,f,int(m[2]),int(m[1]),whole)
+      process(pa,buf,pa.root,f,
+              int(m[2]),int(m[1]),whole)
     exit(0)
 if __name__ == "__main__":
     main()