comparison bin/ix.py @ 94:d60073ec798a

just struggling with argparse
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Apr 2021 19:22:27 +0000
parents
children 2b880f2ce894
comparison
equal deleted inserted replaced
93:4d870a7ec871 94:d60073ec798a
#!/usr/bin/env python
'''Extract request records from Common Crawl WARC-format files
given length, offset and file triples.
Input one triple on command line, or
triples from stdin as tab-delimited lines
or complete cdx index lines.

Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if all three flags were given.'''

import argparse

# add_help=False frees -h so that it can be used for --headers below;
# --help is re-added by hand as an ordinary flag.
parser = argparse.ArgumentParser(
    description='''Extract records from warc files given length, offset and file triples.
Input one triple on command line, or
triples from stdin as tab-delimited lines
or complete cdx index lines.''',
    epilog='''[ -x | length offset file]

Note that if no output flag(s) is/are given,
the whole WARC record will be output, more efficiently than
would be the case if all three flags were given.''',
    add_help=False,
    conflict_handler='resolve',
    formatter_class=argparse.RawDescriptionHelpFormatter
)

# NOTE(review): --help is only stored as a boolean; nothing in this part of
# the file prints the help text in response to it.
parser.add_argument('--help', help='Show help', action='store_true')
parser.add_argument('-d', '--debug', help='Debug output', action='store_true')
parser.add_argument('-w', '--warc', help='output WARC headers',
                    action='store_true')
parser.add_argument('-h', '--headers', help='output HTTP headers',
                    action='store_true')
parser.add_argument('-b', '--body', help='output HTTP body',
                    action='store_true')
parser.add_argument('-e', '--exec', help='pipes each result thru EXEC')

# argparse cannot express "one flag XOR a group of positionals": a mutually
# exclusive group would also make the three positionals exclude each other.
# So -x keeps its own group and the exclusion against the triple is checked
# in parse_command_line() below.
sg = parser.add_mutually_exclusive_group()
sg.add_argument('-x', '--index',
                help='take lines of triples from a cdx index file as input',
                action='store_true')

# The group must hang off the parser itself: argparse does not support
# nesting an argument group inside a mutually exclusive group, and a group
# nested that way is never shown in the help output.
tg = parser.add_argument_group('triple', 'explicit triple')
# nargs='?' makes each positional optional (None when absent), so that the
# -x and stdin input modes can be used without a command-line triple.
tg.add_argument('length', type=int, nargs='?',
                help='length in bytes of gzipped record')
tg.add_argument('offset', type=int, nargs='?',
                help='start position in bytes of gzipped record')
tg.add_argument('filename', nargs='?',
                help='name of gzipped Common Crawl WARC-format file')


def parse_command_line(argv=None):
    '''Parse argv (default: sys.argv[1:]) and validate the input mode.

    Accepts exactly one of: -x/--index, a complete length/offset/filename
    triple, or neither (triples are then expected on stdin).

    Returns the argparse.Namespace.  On an invalid combination, reports the
    problem through parser.error(), which prints usage and raises SystemExit.
    '''
    args = parser.parse_args(argv)
    present = [value is not None
               for value in (args.length, args.offset, args.filename)]
    if args.index and any(present):
        parser.error('-x/--index cannot be combined with an explicit triple')
    if any(present) and not all(present):
        parser.error('an explicit triple needs length, offset and filename')
    return args


# Temporary debugging aid: show the generated help unconditionally.
parser.print_help()