Mercurial > hg > cc > cirrus_home
changeset 137:bb0153be65b5
Add command-line argument --fpath to replace the hard-coded FPAT, which is now its default value
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 21 Jul 2021 20:05:42 +0000 |
parents | f30a1b268cea |
children | 9ea12f7b304b |
files | bin/ix.py |
diffstat | 1 file changed, 9 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/ix.py Wed Jul 21 20:04:11 2021 +0000 +++ b/bin/ix.py Wed Jul 21 20:05:42 2021 +0000 @@ -4,7 +4,7 @@ Input one triple on command line, or triples from stdin as tab-delimited lines or complete cdx index lines. -In all cases by 'filename' is meant crawlid/segmentid/filename +In all cases by 'filename' is meant crawlid/segmentid/type/filename Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' @@ -146,7 +146,7 @@ Input one triple on command line, or triples from stdin as tab-delimited lines or complete cdx index lines. - In all cases by 'filename' is meant crawlid/segmentid/filename''', + In all cases by 'filename' is meant crawlid/segmentid/type/filename''', epilog='''Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if all three flags were given.''', @@ -154,7 +154,7 @@ conflict_handler='resolve', formatter_class=HackFormat ) - + fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s') parser.add_argument('--help',help='Show help',action='help') parser.add_argument('-d','--debug',help='Debug output',action='store_true') parser.add_argument('-w','--warc',help='output WARC headers', @@ -164,6 +164,9 @@ parser.add_argument('-b','--body',help='output HTTP body', action='store_true') parser.add_argument('-c','--cmd',help='pipes each result thru CMD') + parser.add_argument('-f','--fpath', + help=fphelp, + default=FPAT) parser.add_argument('-r','--root',nargs='?', help='File path root, create a copy there if necessary', default='/beegfs/common_crawl'), @@ -209,7 +212,7 @@ continue print("index line problem: \"%s\""%l,file=sys.stderr,end='') exit(2) - f=FPAT%(m[3:7]) + f=pa.fpath%(m[3:7]) try: process(pa,buf,f, int(m[2]),int(m[1]),whole) @@ -219,7 +222,7 @@ exit(3) elif pa.length is not None: 
print(pa.filename,file=sys.stderr) - process(pa,buf,FPAT%tuple(pa.filename.split('/')), + process(pa,buf,pa.fpath%tuple(pa.filename.split('/')), pa.offset,pa.length,whole) else: print("Reading length, offset, filename tab-delimited triples from stdin...", @@ -231,7 +234,7 @@ offset=int(offset) except ValueError as e: parser.error('Invalid input line: %s\n "%s"'%(e,l)) - process(pa,buf,FPAT%tuple(filename.split('/')), + process(pa,buf,pa.fpath%tuple(filename.split('/')), offset,length,whole) if __name__ == "__main__":