Mercurial > hg > cc > cirrus_home
changeset 100:f7623dbd8eb5
bare minimum working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 18 Apr 2021 17:03:45 +0000 |
parents | d48537c4cbae |
children | fbca56fabbac |
files | bin/ix.py |
diffstat | 1 files changed, 66 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/ix.py Fri Apr 16 18:28:00 2021 +0000
+++ b/bin/ix.py Sun Apr 18 17:03:45 2021 +0000
@@ -17,52 +17,70 @@
     FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
     return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]',
                           FOO)
 
-
-parser = argparse.ArgumentParser(
-  description='''Extract records from warc files given length, offset and file triples.
-Input one triple on command line, or
-triples from stdin as tab-delimited lines
-or complete cdx index lines.''',
-  epilog='''Note that if no output flag(s) is/are given,
-the whole WARC record will be output, more efficiently than
-would be the case if all three flags were given.''',
-  add_help=False,
-  conflict_handler='resolve',
-  formatter_class=HackFormat
-  )
-
-parser.add_argument('--help',help='Show help',action='help')
-parser.add_argument('-d','--debug',help='Debug output',action='store_true')
-parser.add_argument('-w','--warc',help='output WARC headers',
-                    action='store_true')
-parser.add_argument('-h','--headers',help='output HTTP headers',
-                    action='store_true')
-parser.add_argument('-b','--body',help='output HTTP body',
-                    action='store_true')
-parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
-sg=parser.add_mutually_exclusive_group()
-sg.add_argument('-x','--index',
-                help='take lines of triples from a cdx index file as input',
-                action='store_true')
-sg.add_argument('length',type=int,
-                help='length in bytes of gzipped record',
-                nargs='?')
-parser.add_argument('offset',type=int,
-                    help='start position in bytes of gzipped record',
-                    nargs='?')
-parser.add_argument('filename',
-                    help='name of gzipped Common Crawl WARC-format file',
-                    nargs='?',
-                    type=argparse.FileType('rb',0))
-# Hack the order of optional and positional in the help output
-parser._action_groups.sort(key=lambda g:g.title)
-#parser.print_help()
-pa=parser.parse_args(sys.argv[1:])
-# We have to enforce our own check..
-print(pa,file=sys.stderr)
-if pa.length is not None and (
-    pa.offset is None or pa.filename is None):
-  parser.error("length, offset and filename must all be supplied together")
-
-buf=bytearray(1024*1024)
+
+def process(options,buf,file,offset,length):
+  whole=not (options.warc or options.headers or options.body)
+  if whole:
+    file.seek(offset)
+    bv=memoryview(buf)[:length]
+    nb=file.readinto(bv)
+    if nb!=length:
+      print("losing",file.name,name,length,nb,file=sys.stderr)
+      exit(1)
+    sys.stdout.buffer.write(bv)
+
+def main():
+  parser = argparse.ArgumentParser(
+    description='''Extract records from warc files given length, offset and file triples.
+  Input one triple on command line, or
+  triples from stdin as tab-delimited lines
+  or complete cdx index lines.''',
+    epilog='''Note that if no output flag(s) is/are given,
+  the whole WARC record will be output, more efficiently than
+  would be the case if all three flags were given.''',
+    add_help=False,
+    conflict_handler='resolve',
+    formatter_class=HackFormat
+    )
+  parser.add_argument('--help',help='Show help',action='help')
+  parser.add_argument('-d','--debug',help='Debug output',action='store_true')
+  parser.add_argument('-w','--warc',help='output WARC headers',
+                      action='store_true')
+  parser.add_argument('-h','--headers',help='output HTTP headers',
+                      action='store_true')
+  parser.add_argument('-b','--body',help='output HTTP body',
+                      action='store_true')
+  parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
+  sg=parser.add_mutually_exclusive_group()
+  sg.add_argument('-x','--index',
+                  help='take lines of triples from a cdx index file as input',
+                  action='store_true')
+  sg.add_argument('length',type=int,
+                  help='length in bytes of gzipped record',
+                  nargs='?')
+  parser.add_argument('offset',type=int,
+                      help='start position in bytes of gzipped record',
+                      nargs='?')
+  parser.add_argument('filename',
+                      help='name of gzipped Common Crawl WARC-format file',
+                      nargs='?',
+                      type=argparse.FileType('rb',0))
+  # Hack the order of optional and positional in the help output
+  parser._action_groups.sort(key=lambda g:g.title)
+  #parser.print_help()
+  pa=parser.parse_args(sys.argv[1:])
+  print(pa,file=sys.stderr)
+  if pa.length is not None:
+    # We have to enforce our own check..
+    if pa.offset is None or pa.filename is None:
+      parser.error("length, offset and filename must all be supplied together")
+
+  buf=bytearray(1024*1024)
+
+  if pa.length is not None:
+    process(pa,buf,pa.filename,pa.offset,pa.length)
+    exit(0)
+
+if __name__ == "__main__":
+  main()