Mercurial > hg > cc > cirrus_home
comparison bin/ix.py @ 100:f7623dbd8eb5
bare minimum working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 18 Apr 2021 17:03:45 +0000 |
parents | d48537c4cbae |
children | 61122560ae0c |
comparison
equal
deleted
inserted
replaced
99:d48537c4cbae | 100:f7623dbd8eb5 |
---|---|
15 def format_help(self): | 15 def format_help(self): |
16 global FOO | 16 global FOO |
17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) | 17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) |
18 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', | 18 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', |
19 FOO) | 19 FOO) |
20 | |
21 parser = argparse.ArgumentParser( | |
22 description='''Extract records from warc files given length, offset and file triples. | |
23 Input one triple on command line, or | |
24 triples from stdin as tab-delimited lines | |
25 or complete cdx index lines.''', | |
26 epilog='''Note that if no output flag(s) is/are given, | |
27 the whole WARC record will be output, more efficiently than | |
28 would be the case if all three flags were given.''', | |
29 add_help=False, | |
30 conflict_handler='resolve', | |
31 formatter_class=HackFormat | |
32 ) | |
33 | |
34 parser.add_argument('--help',help='Show help',action='help') | |
35 parser.add_argument('-d','--debug',help='Debug output',action='store_true') | |
36 parser.add_argument('-w','--warc',help='output WARC headers', | |
37 action='store_true') | |
38 parser.add_argument('-h','--headers',help='output HTTP headers', | |
39 action='store_true') | |
40 parser.add_argument('-b','--body',help='output HTTP body', | |
41 action='store_true') | |
42 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') | |
43 sg=parser.add_mutually_exclusive_group() | |
44 sg.add_argument('-x','--index', | |
45 help='take lines of triples from a cdx index file as input', | |
46 action='store_true') | |
47 sg.add_argument('length',type=int, | |
48 help='length in bytes of gzipped record', | |
49 nargs='?') | |
50 parser.add_argument('offset',type=int, | |
51 help='start position in bytes of gzipped record', | |
52 nargs='?') | |
53 parser.add_argument('filename', | |
54 help='name of gzipped Common Crawl WARC-format file', | |
55 nargs='?', | |
56 type=argparse.FileType('rb',0)) | |
57 # Hack the order of optional and positional in the help output | |
58 parser._action_groups.sort(key=lambda g:g.title) | |
59 #parser.print_help() | |
60 pa=parser.parse_args(sys.argv[1:]) | |
61 # We have to enforce our own check.. | |
62 print(pa,file=sys.stderr) | |
63 if pa.length is not None and ( | |
64 pa.offset is None or pa.filename is None): | |
65 parser.error("length, offset and filename must all be supplied together") | |
66 | |
67 buf=bytearray(1024*1024) | |
68 | 20 |
21 def process(options,buf,file,offset,length): | |
22 whole=not (options.warc or options.headers or options.body) | |
23 if whole: | |
24 file.seek(offset) | |
25 bv=memoryview(buf)[:length] | |
26 nb=file.readinto(bv) | |
27 if nb!=length: | |
28 print("losing",file.name,name,length,nb,file=sys.stderr) | |
29 exit(1) | |
30 sys.stdout.buffer.write(bv) | |
31 | |
32 def main(): | |
33 parser = argparse.ArgumentParser( | |
34 description='''Extract records from warc files given length, offset and file triples. | |
35 Input one triple on command line, or | |
36 triples from stdin as tab-delimited lines | |
37 or complete cdx index lines.''', | |
38 epilog='''Note that if no output flag(s) is/are given, | |
39 the whole WARC record will be output, more efficiently than | |
40 would be the case if all three flags were given.''', | |
41 add_help=False, | |
42 conflict_handler='resolve', | |
43 formatter_class=HackFormat | |
44 ) | |
45 | |
46 parser.add_argument('--help',help='Show help',action='help') | |
47 parser.add_argument('-d','--debug',help='Debug output',action='store_true') | |
48 parser.add_argument('-w','--warc',help='output WARC headers', | |
49 action='store_true') | |
50 parser.add_argument('-h','--headers',help='output HTTP headers', | |
51 action='store_true') | |
52 parser.add_argument('-b','--body',help='output HTTP body', | |
53 action='store_true') | |
54 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') | |
55 sg=parser.add_mutually_exclusive_group() | |
56 sg.add_argument('-x','--index', | |
57 help='take lines of triples from a cdx index file as input', | |
58 action='store_true') | |
59 sg.add_argument('length',type=int, | |
60 help='length in bytes of gzipped record', | |
61 nargs='?') | |
62 parser.add_argument('offset',type=int, | |
63 help='start position in bytes of gzipped record', | |
64 nargs='?') | |
65 parser.add_argument('filename', | |
66 help='name of gzipped Common Crawl WARC-format file', | |
67 nargs='?', | |
68 type=argparse.FileType('rb',0)) | |
69 # Hack the order of optional and positional in the help output | |
70 parser._action_groups.sort(key=lambda g:g.title) | |
71 #parser.print_help() | |
72 pa=parser.parse_args(sys.argv[1:]) | |
73 print(pa,file=sys.stderr) | |
74 if pa.length is not None: | |
75 # We have to enforce our own check.. | |
76 if pa.offset is None or pa.filename is None: | |
77 parser.error("length, offset and filename must all be supplied together") | |
78 | |
79 buf=bytearray(1024*1024) | |
80 | |
81 if pa.length is not None: | |
82 process(pa,buf,pa.filename,pa.offset,pa.length) | |
83 exit(0) | |
84 | |
85 if __name__ == "__main__": | |
86 main() |