comparison bin/ix.py @ 97:2b880f2ce894

basic help format hacking works
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 16 Apr 2021 12:55:05 +0000
parents d60073ec798a
children 1a4c5fdc2923
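comparison (side by side; each row shows the old line number and text, then the new line number and text)
legend: equal, deleted, inserted, replaced
left: 96:a7e72a254790, right: 97:2b880f2ce894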
5 triples from stdin as tab-delimited lines 5 triples from stdin as tab-delimited lines
6 or complete cdx index lines. 6 or complete cdx index lines.
7 7
8 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if all three flags were given.''' 8 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if all three flags were given.'''
9 9
10 import argparse 10 import sys, argparse, regex
11 11
12 HACK=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
13
14 class HackFormat(argparse.RawDescriptionHelpFormatter):
15 def format_help(self):
16 global FOO
17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
18 return HACK.sub('\n [ ( -x | length offset filename ) ]',
19 FOO)
20
12 parser = argparse.ArgumentParser( 21 parser = argparse.ArgumentParser(
13 description='''Extract records from warc files given length, offset and file triples. 22 description='''Extract records from warc files given length, offset and file triples.
14 Input one triple on command line, or 23 Input one triple on command line, or
15 triples from stdin as tab-delimited lines 24 triples from stdin as tab-delimited lines
16 or complete cdx index lines.''', 25 or complete cdx index lines.''',
17 epilog='''[ -x | length option file] 26 epilog='''Note that if no output flag(s) is/are given,
18
19 Note that if no output flag(s) is/are given,
20 the whole WARC record will be output, more efficiently than 27 the whole WARC record will be output, more efficiently than
21 would be the case if all three flags were given.''', 28 would be the case if all three flags were given.''',
22 add_help=False, 29 add_help=False,
23 conflict_handler='resolve', 30 conflict_handler='resolve',
24 formatter_class=argparse.RawDescriptionHelpFormatter 31 formatter_class=HackFormat
25 ) 32 )
26 33
27 parser.add_argument('--help',help='Show help',action='store_true') 34 parser.add_argument('--help',help='Show help',action='help')
28 parser.add_argument('-d','--debug',help='Debug output',action='store_true') 35 parser.add_argument('-d','--debug',help='Debug output',action='store_true')
29 parser.add_argument('-w','--warc',help='output WARC headers', 36 parser.add_argument('-w','--warc',help='output WARC headers',
30 action='store_true') 37 action='store_true')
31 parser.add_argument('-h','--headers',help='output HTTP headers', 38 parser.add_argument('-h','--headers',help='output HTTP headers',
32 action='store_true') 39 action='store_true')
33 parser.add_argument('-b','--body',help='output HTTP body', 40 parser.add_argument('-b','--body',help='output HTTP body',
34 action='store_true') 41 action='store_true')
35 parser.add_argument('-e','--exec',help='pipes each result thru EXEC') 42 parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
36 sg=parser.add_mutually_exclusive_group() 43 sg=parser.add_mutually_exclusive_group()
37 sg.add_argument('-x','--index', 44 sg.add_argument('-x','--index',
38 help='take lines of triples from a cdx index file as input', 45 help='take lines of triples from a cdx index file as input',
39 action='store_true') 46 action='store_true')
40 tg=sg.add_argument_group('triple','explicit triple') 47 sg.add_argument('length',type=int,
41 tg.add_argument('length',type=int, 48 help='length in bytes of gzipped record',
42 help='length in bytes of gzipped record') 49 nargs='?')
43 tg.add_argument('offset',type=int, 50 parser.add_argument('offset',type=int,
44 help='start position in bytes of gzipped record') 51 help='start position in bytes of gzipped record',
45 tg.add_argument('filename', 52 nargs='?')
46 help='name of gzipped Common Crawl WARC-format file') 53 parser.add_argument('filename',
54 help='name of gzipped Common Crawl WARC-format file',
55 nargs='?')
47 56
48 parser.print_help() 57 #parser.print_help()
58 pa=parser.parse_args(sys.argv[1:])
59 # We have to enforce our own check..
60 print(pa)
61