Mercurial > hg > cc > cirrus_home
comparison bin/ix.py @ 104:61122560ae0c
-x barely working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 19 Apr 2021 18:09:51 +0000 |
parents | f7623dbd8eb5 |
children | baf56ff538f8 |
comparison
equal
deleted
inserted
replaced
103:d354e29c91a5 | 104:61122560ae0c |
---|---|
16 global FOO | 16 global FOO |
17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) | 17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) |
18 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', | 18 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', |
19 FOO) | 19 FOO) |
20 | 20 |
21 def process(options,buf,file,offset,length): | 21 def process(options,buf,file,offset,length,whole): |
22 whole=not (options.warc or options.headers or options.body) | |
23 if whole: | 22 if whole: |
24 file.seek(offset) | 23 file.seek(offset) |
25 bv=memoryview(buf)[:length] | 24 bv=memoryview(buf)[:length] |
26 nb=file.readinto(bv) | 25 nb=file.readinto(bv) |
27 if nb!=length: | 26 if nb!=length: |
28 print("losing",file.name,name,length,nb,file=sys.stderr) | 27 print("losing",file.name,length,nb,file=sys.stderr) |
29 exit(1) | 28 exit(1) |
30 sys.stdout.buffer.write(bv) | 29 sys.stdout.buffer.write(bv) |
30 file.close() | |
31 | 31 |
32 def main(): | 32 def main(): |
33 parser = argparse.ArgumentParser( | 33 parser = argparse.ArgumentParser( |
34 description='''Extract records from warc files given length, offset and file triples. | 34 description='''Extract records from warc files given length, offset and file triples. |
35 Input one triple on command line, or | 35 Input one triple on command line, or |
68 type=argparse.FileType('rb',0)) | 68 type=argparse.FileType('rb',0)) |
69 # Hack the order of optional and positional in the help output | 69 # Hack the order of optional and positional in the help output |
70 parser._action_groups.sort(key=lambda g:g.title) | 70 parser._action_groups.sort(key=lambda g:g.title) |
71 #parser.print_help() | 71 #parser.print_help() |
72 pa=parser.parse_args(sys.argv[1:]) | 72 pa=parser.parse_args(sys.argv[1:]) |
73 print(pa,file=sys.stderr) | 73 #print(pa,file=sys.stderr) |
74 if pa.length is not None: | 74 if pa.length is not None: |
75 # We have to enforce our own check.. | 75 # We have to enforce our own check.. |
76 if pa.offset is None or pa.filename is None: | 76 if pa.offset is None or pa.filename is None: |
77 parser.error("length, offset and filename must all be supplied together") | 77 parser.error("length, offset and filename must all be supplied together") |
78 | 78 |
79 buf=bytearray(1024*1024) | 79 buf=bytearray(1024*1024) |
80 | 80 |
81 whole=not (pa.warc or pa.headers or pa.body) | |
81 if pa.length is not None: | 82 if pa.length is not None: |
82 process(pa,buf,pa.filename,pa.offset,pa.length) | 83 process(pa,buf,pa.filename,pa.offset,pa.length,whole) |
83 exit(0) | 84 exit(0) |
84 | 85 if pa.index: |
86 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') | |
87 for l in sys.stdin: | |
88 m=CDX.search(l) | |
89 if m is None: | |
90 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) | |
91 exit(2) | |
92 f="%s/%s/%s"%(m[3],m[4],m[5]) | |
93 process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole) | |
94 exit(0) | |
85 if __name__ == "__main__": | 95 if __name__ == "__main__": |
86 main() | 96 main() |