changeset 137:bb0153be65b5

Add command-line argument --fpath, replacing the hard-coded FPAT, which is now its default value
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 21 Jul 2021 20:05:42 +0000
parents f30a1b268cea
children 9ea12f7b304b
files bin/ix.py
diffstat 1 files changed, 9 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Wed Jul 21 20:04:11 2021 +0000
+++ b/bin/ix.py	Wed Jul 21 20:05:42 2021 +0000
@@ -4,7 +4,7 @@
 Input one triple on command line, or
 triples from stdin as tab-delimited lines
 or complete cdx index lines.
-In all cases by 'filename' is meant crawlid/segmentid/filename
+In all cases by 'filename' is meant crawlid/segmentid/type/filename
 
 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
 
@@ -146,7 +146,7 @@
   Input one triple on command line, or
   triples from stdin as tab-delimited lines
   or complete cdx index lines.
-  In all cases by 'filename' is meant crawlid/segmentid/filename''',
+  In all cases by 'filename' is meant crawlid/segmentid/type/filename''',
     epilog='''Note that if no output flag(s) is/are given,
   the whole WARC record will be output, more efficiently than
   would be the case if all three flags were given.''',
@@ -154,7 +154,7 @@
     conflict_handler='resolve',
     formatter_class=HackFormat
     )
-
+  fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s')
   parser.add_argument('--help',help='Show help',action='help')
   parser.add_argument('-d','--debug',help='Debug output',action='store_true')
   parser.add_argument('-w','--warc',help='output WARC headers',
@@ -164,6 +164,9 @@
   parser.add_argument('-b','--body',help='output HTTP body',
                       action='store_true')
   parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
+  parser.add_argument('-f','--fpath',
+                      help=fphelp,
+                      default=FPAT)
   parser.add_argument('-r','--root',nargs='?',
                   help='File path root, create a copy there if necessary',
                   default='/beegfs/common_crawl'),
@@ -209,7 +212,7 @@
           continue
         print("index line problem: \"%s\""%l,file=sys.stderr,end='')
         exit(2)
-      f=FPAT%(m[3:7])
+      f=pa.fpath%(m[3:7])
       try:
         process(pa,buf,f,
                 int(m[2]),int(m[1]),whole)
@@ -219,7 +222,7 @@
         exit(3)
   elif pa.length is not None:
     print(pa.filename,file=sys.stderr)
-    process(pa,buf,FPAT%tuple(pa.filename.split('/')),
+    process(pa,buf,pa.fpath%tuple(pa.filename.split('/')),
             pa.offset,pa.length,whole)
   else:
     print("Reading length, offset, filename tab-delimited triples from stdin...",
@@ -231,7 +234,7 @@
         offset=int(offset)
       except ValueError as e:
         parser.error('Invalid input line: %s\n "%s"'%(e,l))
-      process(pa,buf,FPAT%tuple(filename.split('/')),
+      process(pa,buf,pa.fpath%tuple(filename.split('/')),
               offset,length,whole)
 
 if __name__ == "__main__":