Mercurial > hg > cc > cirrus_home
comparison bin/ix.py @ 107:007f35b9df9c
added support for copying to/using /dev/shm or /tmp
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 20 Apr 2021 19:11:57 +0000 |
parents | 815b33c3254a |
children | 9e5b117dc461 |
comparison (legend: equal | deleted | inserted | replaced)
106:815b33c3254a | 107:007f35b9df9c |
---|---|
6 or complete cdx index lines. | 6 or complete cdx index lines. |
7 In all cases by 'filename' is meant crawlid/segmentid/filename | 7 In all cases by 'filename' is meant crawlid/segmentid/filename |
8 | 8 |
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' | 9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' |
10 | 10 |
11 import sys, argparse, regex | 11 import sys, argparse, regex, os, shutil |
12 | 12 |
13 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') | 13 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') |
14 FPAT="/beegfs/common_crawl/%s/%s/orig/warc/%s" | |
15 BINOUT=sys.stdout.buffer | 14 BINOUT=sys.stdout.buffer |
15 FPAT="/%s/%s/orig/warc/%s" | |
16 | 16 |
17 class HackFormat(argparse.RawDescriptionHelpFormatter): | 17 class HackFormat(argparse.RawDescriptionHelpFormatter): |
18 def format_help(self): | 18 def format_help(self): |
19 global FOO | 19 global FOO |
20 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) | 20 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) |
21 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', | 21 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', |
22 FOO) | 22 FOO) |
23 | 23 |
24 def process(options,buf,filename,offset,length,whole): | 24 def process(options,buf,root,filename,offset,length,whole): |
25 file=open(filename,'rb',0) | 25 rfn=root+filename |
26 if root!="/beegfs/common_crawl": | |
27 if not os.path.exists(rfn): | |
28 if not os.path.exists(os.path.dirname(rfn)): | |
29 os.makedirs(os.path.dirname(rfn)) | |
30 with open('/beegfs/common_crawl'+filename,'rb',0) as infile, \ | |
31 open(rfn,'wb',0) as outfile: | |
32 shutil.copyfileobj(infile,outfile,2048*1024) | |
33 # while True: | |
34 # l=infile.readinto(buf) | |
35 # if l is None: | |
36 # break | |
37 # print(l,file=sys.stderr) | |
38 # outfile.write(memoryview(buf)[:l]) | |
39 infile.close() | |
40 outfile.close() | |
41 file=open(rfn,'rb',0) | |
26 if whole: | 42 if whole: |
27 file.seek(offset) | 43 file.seek(offset) |
28 bv=memoryview(buf)[:length] | 44 bv=memoryview(buf)[:length] |
29 nb=file.readinto(bv) | 45 nb=file.readinto(bv) |
30 if nb!=length: | 46 if nb!=length: |
54 parser.add_argument('-h','--headers',help='output HTTP headers', | 70 parser.add_argument('-h','--headers',help='output HTTP headers', |
55 action='store_true') | 71 action='store_true') |
56 parser.add_argument('-b','--body',help='output HTTP body', | 72 parser.add_argument('-b','--body',help='output HTTP body', |
57 action='store_true') | 73 action='store_true') |
58 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') | 74 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') |
75 parser.add_argument('-r','--root',nargs='?', | |
76 help='File path root, create a copy there if necessary', | |
77 default='/beegfs/common_crawl'), | |
59 sg=parser.add_mutually_exclusive_group() | 78 sg=parser.add_mutually_exclusive_group() |
60 sg.add_argument('-x','--index', | 79 sg.add_argument('-x','--index', |
61 help='take lines of triples from a cdx index file as input', | 80 help='take lines of triples from a cdx index file as input', |
62 action='store_true') | 81 action='store_true') |
63 sg.add_argument('length',type=int, | 82 sg.add_argument('length',type=int, |
81 | 100 |
82 buf=bytearray(2024*1024) | 101 buf=bytearray(2024*1024) |
83 | 102 |
84 whole=not (pa.warc or pa.headers or pa.body) | 103 whole=not (pa.warc or pa.headers or pa.body) |
85 if pa.length is not None: | 104 if pa.length is not None: |
86 process(pa,buf,FPAT%tuple(pa.filename.split('/')),pa.offset,pa.length,whole) | 105 process(pa,buf,pa.root,FPAT%list(pa.filename.split('/')), |
106 pa.offset,pa.length,whole) | |
87 exit(0) | 107 exit(0) |
88 if pa.index: | 108 if pa.index: |
89 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') | 109 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') |
90 for l in sys.stdin: | 110 for l in sys.stdin: |
91 m=CDX.search(l) | 111 m=CDX.search(l) |
92 if m is None: | 112 if m is None: |
93 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) | 113 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) |
94 exit(2) | 114 exit(2) |
95 f=FPAT%(m[3:6]) | 115 f=FPAT%(m[3:6]) |
96 process(pa,buf,f,int(m[2]),int(m[1]),whole) | 116 process(pa,buf,pa.root,f, |
117 int(m[2]),int(m[1]),whole) | |
97 exit(0) | 118 exit(0) |
98 if __name__ == "__main__": | 119 if __name__ == "__main__": |
99 main() | 120 main() |