comparison bin/ix.py @ 137:bb0153be65b5

Add command-line argument --fpath, replacing the hard-coded FPAT, which is now its default value
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 21 Jul 2021 20:05:42 +0000
parents b51d65ed6c89
children
comparison
equal deleted inserted replaced
136:f30a1b268cea 137:bb0153be65b5
2 '''Extract request records from Common Crawl WARC-format files 2 '''Extract request records from Common Crawl WARC-format files
3 given length, offset and filename triples. 3 given length, offset and filename triples.
4 Input one triple on command line, or 4 Input one triple on command line, or
5 triples from stdin as tab-delimited lines 5 triples from stdin as tab-delimited lines
6 or complete cdx index lines. 6 or complete cdx index lines.
7 In all cases by 'filename' is meant crawlid/segmentid/filename 7 In all cases by 'filename' is meant crawlid/segmentid/type/filename
8 8
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' 9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
10 10
11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex 11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex
12 from isal import igzip 12 from isal import igzip
144 parser = argparse.ArgumentParser( 144 parser = argparse.ArgumentParser(
145 description='''Extract records from warc files given length, offset and file triples. 145 description='''Extract records from warc files given length, offset and file triples.
146 Input one triple on command line, or 146 Input one triple on command line, or
147 triples from stdin as tab-delimited lines 147 triples from stdin as tab-delimited lines
148 or complete cdx index lines. 148 or complete cdx index lines.
149 In all cases by 'filename' is meant crawlid/segmentid/filename''', 149 In all cases by 'filename' is meant crawlid/segmentid/type/filename''',
150 epilog='''Note that if no output flag(s) is/are given, 150 epilog='''Note that if no output flag(s) is/are given,
151 the whole WARC record will be output, more efficiently than 151 the whole WARC record will be output, more efficiently than
152 would be the case if all three flags were given.''', 152 would be the case if all three flags were given.''',
153 add_help=False, 153 add_help=False,
154 conflict_handler='resolve', 154 conflict_handler='resolve',
155 formatter_class=HackFormat 155 formatter_class=HackFormat
156 ) 156 )
157 157 fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s')
158 parser.add_argument('--help',help='Show help',action='help') 158 parser.add_argument('--help',help='Show help',action='help')
159 parser.add_argument('-d','--debug',help='Debug output',action='store_true') 159 parser.add_argument('-d','--debug',help='Debug output',action='store_true')
160 parser.add_argument('-w','--warc',help='output WARC headers', 160 parser.add_argument('-w','--warc',help='output WARC headers',
161 action='store_true') 161 action='store_true')
162 parser.add_argument('-h','--headers',help='output HTTP headers', 162 parser.add_argument('-h','--headers',help='output HTTP headers',
163 action='store_true') 163 action='store_true')
164 parser.add_argument('-b','--body',help='output HTTP body', 164 parser.add_argument('-b','--body',help='output HTTP body',
165 action='store_true') 165 action='store_true')
166 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') 166 parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
167 parser.add_argument('-f','--fpath',
168 help=fphelp,
169 default=FPAT)
167 parser.add_argument('-r','--root',nargs='?', 170 parser.add_argument('-r','--root',nargs='?',
168 help='File path root, create a copy there if necessary', 171 help='File path root, create a copy there if necessary',
169 default='/beegfs/common_crawl'), 172 default='/beegfs/common_crawl'),
170 parser.add_argument('-z','--zipped', 173 parser.add_argument('-z','--zipped',
171 help="output raw gzipped record, ignored if any of -bhw supplied", 174 help="output raw gzipped record, ignored if any of -bhw supplied",
207 if m is None: 210 if m is None:
208 if l.find('/robotstxt/')>-1: 211 if l.find('/robotstxt/')>-1:
209 continue 212 continue
210 print("index line problem: \"%s\""%l,file=sys.stderr,end='') 213 print("index line problem: \"%s\""%l,file=sys.stderr,end='')
211 exit(2) 214 exit(2)
212 f=FPAT%(m[3:7]) 215 f=pa.fpath%(m[3:7])
213 try: 216 try:
214 process(pa,buf,f, 217 process(pa,buf,f,
215 int(m[2]),int(m[1]),whole) 218 int(m[2]),int(m[1]),whole)
216 except Exception as e: 219 except Exception as e:
217 print("Process fail: %s, input line:\n %s"%(e,l), 220 print("Process fail: %s, input line:\n %s"%(e,l),
218 file=sys.stderr,end='') 221 file=sys.stderr,end='')
219 exit(3) 222 exit(3)
220 elif pa.length is not None: 223 elif pa.length is not None:
221 print(pa.filename,file=sys.stderr) 224 print(pa.filename,file=sys.stderr)
222 process(pa,buf,FPAT%tuple(pa.filename.split('/')), 225 process(pa,buf,pa.fpath%tuple(pa.filename.split('/')),
223 pa.offset,pa.length,whole) 226 pa.offset,pa.length,whole)
224 else: 227 else:
225 print("Reading length, offset, filename tab-delimited triples from stdin...", 228 print("Reading length, offset, filename tab-delimited triples from stdin...",
226 file=sys.stderr) 229 file=sys.stderr)
227 for l in sys.stdin: 230 for l in sys.stdin:
229 (length,offset,filename)=l.rstrip().split('\t') 232 (length,offset,filename)=l.rstrip().split('\t')
230 length=int(length) 233 length=int(length)
231 offset=int(offset) 234 offset=int(offset)
232 except ValueError as e: 235 except ValueError as e:
233 parser.error('Invalid input line: %s\n "%s"'%(e,l)) 236 parser.error('Invalid input line: %s\n "%s"'%(e,l))
234 process(pa,buf,FPAT%tuple(filename.split('/')), 237 process(pa,buf,pa.fpath%tuple(filename.split('/')),
235 offset,length,whole) 238 offset,length,whole)
236 239
237 if __name__ == "__main__": 240 if __name__ == "__main__":
238 main() 241 main()