# HG changeset patch # User Henry S. Thompson # Date 1626897942 0 # Node ID bb0153be65b58d6a2e9d6de1f64d08c99057fae1 # Parent f30a1b268cea1a64b39e36cc56c66376ca85a8b1 add cl arg --fpath replacing FPAT, which is now default value diff -r f30a1b268cea -r bb0153be65b5 bin/ix.py --- a/bin/ix.py Wed Jul 21 20:04:11 2021 +0000 +++ b/bin/ix.py Wed Jul 21 20:05:42 2021 +0000 @@ -4,7 +4,7 @@ Input one triple on command line, or triples from stdin as tab-delimited lines or complete cdx index lines. -In all cases by 'filename' is meant crawlid/segmentid/filename +In all cases by 'filename' is meant crawlid/segmentid/type/filename Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' @@ -146,7 +146,7 @@ Input one triple on command line, or triples from stdin as tab-delimited lines or complete cdx index lines. - In all cases by 'filename' is meant crawlid/segmentid/filename''', + In all cases by 'filename' is meant crawlid/segmentid/type/filename''', epilog='''Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if all three flags were given.''', @@ -154,7 +154,7 @@ conflict_handler='resolve', formatter_class=HackFormat ) - + fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s') parser.add_argument('--help',help='Show help',action='help') parser.add_argument('-d','--debug',help='Debug output',action='store_true') parser.add_argument('-w','--warc',help='output WARC headers', @@ -164,6 +164,9 @@ parser.add_argument('-b','--body',help='output HTTP body', action='store_true') parser.add_argument('-c','--cmd',help='pipes each result thru CMD') + parser.add_argument('-f','--fpath', + help=fphelp, + default=FPAT) parser.add_argument('-r','--root',nargs='?', help='File path root, create a copy there if necessary', default='/beegfs/common_crawl'), @@ -209,7 +212,7 @@ continue print("index line problem: \"%s\""%l,file=sys.stderr,end='') exit(2) - f=FPAT%(m[3:7]) + f=pa.fpath%(m[3:7]) try: process(pa,buf,f, int(m[2]),int(m[1]),whole) @@ -219,7 +222,7 @@ exit(3) elif pa.length is not None: print(pa.filename,file=sys.stderr) - process(pa,buf,FPAT%tuple(pa.filename.split('/')), + process(pa,buf,pa.fpath%tuple(pa.filename.split('/')), pa.offset,pa.length,whole) else: print("Reading length, offset, filename tab-delimited triples from stdin...", @@ -231,7 +234,7 @@ offset=int(offset) except ValueError as e: parser.error('Invalid input line: %s\n "%s"'%(e,l)) - process(pa,buf,FPAT%tuple(filename.split('/')), + process(pa,buf,pa.fpath%tuple(filename.split('/')), offset,length,whole) if __name__ == "__main__":