comparison bin/ix.py @ 107:007f35b9df9c

added support for copying to/using /dev/shm or /tmp
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 20 Apr 2021 19:11:57 +0000
parents 815b33c3254a
children 9e5b117dc461
comparison
equal deleted inserted replaced
106:815b33c3254a 107:007f35b9df9c
6 or complete cdx index lines. 6 or complete cdx index lines.
7 In all cases by 'filename' is meant crawlid/segmentid/filename 7 In all cases by 'filename' is meant crawlid/segmentid/filename
8 8
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' 9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
10 10
11 import sys, argparse, regex 11 import sys, argparse, regex, os, shutil
12 12
# Matches the stretch of argparse's generated help that lists -x and the three
# positionals as independent optionals; HackFormat.format_help substitutes a
# grouped alternative for it.  Raw string: \[ and \s are regex escapes, not
# string escapes, and a non-raw literal triggers invalid-escape warnings.
HACK_USAGE=regex.compile(r'\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
# Underlying binary buffer of standard output.
BINOUT=sys.stdout.buffer
# Root-relative path template; the three slots are filled from the
# crawlid/segmentid/filename components, and the storage root is prepended
# separately by process().
FPAT="/%s/%s/orig/warc/%s"
16 16
class HackFormat(argparse.RawDescriptionHelpFormatter):
    '''Raw-description help formatter that patches the usage synopsis.

    The text matched by HACK_USAGE (the separately listed -x flag and
    length/offset/filename positionals) is replaced by a single grouped
    alternative.'''

    def format_help(self):
        '''Return the inherited help text with the HACK_USAGE match rewritten.

        The unmodified help text is also stored in the module-level global FOO.'''
        global FOO
        FOO=super().format_help()
        # NOTE(review): replacement text is copied as it appears in the
        # comparison view, which may have collapsed runs of spaces -- confirm
        # the leading whitespace against the repository copy.
        grouped='\n [ ( -x | length offset filename ) ]'
        return HACK_USAGE.sub(grouped,FOO)
23 23
24 def process(options,buf,filename,offset,length,whole): 24 def process(options,buf,root,filename,offset,length,whole):
25 file=open(filename,'rb',0) 25 rfn=root+filename
26 if root!="/beegfs/common_crawl":
27 if not os.path.exists(rfn):
28 if not os.path.exists(os.path.dirname(rfn)):
29 os.makedirs(os.path.dirname(rfn))
30 with open('/beegfs/common_crawl'+filename,'rb',0) as infile, \
31 open(rfn,'wb',0) as outfile:
32 shutil.copyfileobj(infile,outfile,2048*1024)
33 # while True:
34 # l=infile.readinto(buf)
35 # if l is None:
36 # break
37 # print(l,file=sys.stderr)
38 # outfile.write(memoryview(buf)[:l])
39 infile.close()
40 outfile.close()
41 file=open(rfn,'rb',0)
26 if whole: 42 if whole:
27 file.seek(offset) 43 file.seek(offset)
28 bv=memoryview(buf)[:length] 44 bv=memoryview(buf)[:length]
29 nb=file.readinto(bv) 45 nb=file.readinto(bv)
30 if nb!=length: 46 if nb!=length:
54 parser.add_argument('-h','--headers',help='output HTTP headers', 70 parser.add_argument('-h','--headers',help='output HTTP headers',
55 action='store_true') 71 action='store_true')
56 parser.add_argument('-b','--body',help='output HTTP body', 72 parser.add_argument('-b','--body',help='output HTTP body',
57 action='store_true') 73 action='store_true')
58 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') 74 parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
75 parser.add_argument('-r','--root',nargs='?',
76 help='File path root, create a copy there if necessary',
77 default='/beegfs/common_crawl'),
59 sg=parser.add_mutually_exclusive_group() 78 sg=parser.add_mutually_exclusive_group()
60 sg.add_argument('-x','--index', 79 sg.add_argument('-x','--index',
61 help='take lines of triples from a cdx index file as input', 80 help='take lines of triples from a cdx index file as input',
62 action='store_true') 81 action='store_true')
63 sg.add_argument('length',type=int, 82 sg.add_argument('length',type=int,
81 100
82 buf=bytearray(2024*1024) 101 buf=bytearray(2024*1024)
83 102
84 whole=not (pa.warc or pa.headers or pa.body) 103 whole=not (pa.warc or pa.headers or pa.body)
85 if pa.length is not None: 104 if pa.length is not None:
86 process(pa,buf,FPAT%tuple(pa.filename.split('/')),pa.offset,pa.length,whole) 105 process(pa,buf,pa.root,FPAT%list(pa.filename.split('/')),
106 pa.offset,pa.length,whole)
87 exit(0) 107 exit(0)
88 if pa.index: 108 if pa.index:
89 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') 109 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
90 for l in sys.stdin: 110 for l in sys.stdin:
91 m=CDX.search(l) 111 m=CDX.search(l)
92 if m is None: 112 if m is None:
93 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) 113 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
94 exit(2) 114 exit(2)
95 f=FPAT%(m[3:6]) 115 f=FPAT%(m[3:6])
96 process(pa,buf,f,int(m[2]),int(m[1]),whole) 116 process(pa,buf,pa.root,f,
117 int(m[2]),int(m[1]),whole)
97 exit(0) 118 exit(0)
# Run the command-line entry point only when executed as a script, so that
# importing this module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()