Mercurial > hg > cc > cirrus_home
comparison bin/ix.py @ 137:bb0153be65b5
Add command-line argument --fpath, replacing the hard-coded FPAT, which is now its default value
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 21 Jul 2021 20:05:42 +0000 |
parents | b51d65ed6c89 |
children |
comparison
equal
deleted
inserted
replaced
136:f30a1b268cea | 137:bb0153be65b5 |
---|---|
2 '''Extract request records from Common Crawl WARC-format files | 2 '''Extract request records from Common Crawl WARC-format files |
3 given length, offset and filename triples. | 3 given length, offset and filename triples. |
4 Input one triple on command line, or | 4 Input one triple on command line, or |
5 triples from stdin as tab-delimited lines | 5 triples from stdin as tab-delimited lines |
6 or complete cdx index lines. | 6 or complete cdx index lines. |
7 In all cases by 'filename' is meant crawlid/segmentid/filename | 7 In all cases by 'filename' is meant crawlid/segmentid/type/filename |
8 | 8 |
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' | 9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' |
10 | 10 |
11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex | 11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex |
12 from isal import igzip | 12 from isal import igzip |
144 parser = argparse.ArgumentParser( | 144 parser = argparse.ArgumentParser( |
145 description='''Extract records from warc files given length, offset and file triples. | 145 description='''Extract records from warc files given length, offset and file triples. |
146 Input one triple on command line, or | 146 Input one triple on command line, or |
147 triples from stdin as tab-delimited lines | 147 triples from stdin as tab-delimited lines |
148 or complete cdx index lines. | 148 or complete cdx index lines. |
149 In all cases by 'filename' is meant crawlid/segmentid/filename''', | 149 In all cases by 'filename' is meant crawlid/segmentid/type/filename''', |
150 epilog='''Note that if no output flag(s) is/are given, | 150 epilog='''Note that if no output flag(s) is/are given, |
151 the whole WARC record will be output, more efficiently than | 151 the whole WARC record will be output, more efficiently than |
152 would be the case if all three flags were given.''', | 152 would be the case if all three flags were given.''', |
153 add_help=False, | 153 add_help=False, |
154 conflict_handler='resolve', | 154 conflict_handler='resolve', |
155 formatter_class=HackFormat | 155 formatter_class=HackFormat |
156 ) | 156 ) |
157 | 157 fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s') |
158 parser.add_argument('--help',help='Show help',action='help') | 158 parser.add_argument('--help',help='Show help',action='help') |
159 parser.add_argument('-d','--debug',help='Debug output',action='store_true') | 159 parser.add_argument('-d','--debug',help='Debug output',action='store_true') |
160 parser.add_argument('-w','--warc',help='output WARC headers', | 160 parser.add_argument('-w','--warc',help='output WARC headers', |
161 action='store_true') | 161 action='store_true') |
162 parser.add_argument('-h','--headers',help='output HTTP headers', | 162 parser.add_argument('-h','--headers',help='output HTTP headers', |
163 action='store_true') | 163 action='store_true') |
164 parser.add_argument('-b','--body',help='output HTTP body', | 164 parser.add_argument('-b','--body',help='output HTTP body', |
165 action='store_true') | 165 action='store_true') |
166 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') | 166 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') |
167 parser.add_argument('-f','--fpath', | |
168 help=fphelp, | |
169 default=FPAT) | |
167 parser.add_argument('-r','--root',nargs='?', | 170 parser.add_argument('-r','--root',nargs='?', |
168 help='File path root, create a copy there if necessary', | 171 help='File path root, create a copy there if necessary', |
169 default='/beegfs/common_crawl'), | 172 default='/beegfs/common_crawl'), |
170 parser.add_argument('-z','--zipped', | 173 parser.add_argument('-z','--zipped', |
171 help="output raw gzipped record, ignored if any of -bhw supplied", | 174 help="output raw gzipped record, ignored if any of -bhw supplied", |
207 if m is None: | 210 if m is None: |
208 if l.find('/robotstxt/')>-1: | 211 if l.find('/robotstxt/')>-1: |
209 continue | 212 continue |
210 print("index line problem: \"%s\""%l,file=sys.stderr,end='') | 213 print("index line problem: \"%s\""%l,file=sys.stderr,end='') |
211 exit(2) | 214 exit(2) |
212 f=FPAT%(m[3:7]) | 215 f=pa.fpath%(m[3:7]) |
213 try: | 216 try: |
214 process(pa,buf,f, | 217 process(pa,buf,f, |
215 int(m[2]),int(m[1]),whole) | 218 int(m[2]),int(m[1]),whole) |
216 except Exception as e: | 219 except Exception as e: |
217 print("Process fail: %s, input line:\n %s"%(e,l), | 220 print("Process fail: %s, input line:\n %s"%(e,l), |
218 file=sys.stderr,end='') | 221 file=sys.stderr,end='') |
219 exit(3) | 222 exit(3) |
220 elif pa.length is not None: | 223 elif pa.length is not None: |
221 print(pa.filename,file=sys.stderr) | 224 print(pa.filename,file=sys.stderr) |
222 process(pa,buf,FPAT%tuple(pa.filename.split('/')), | 225 process(pa,buf,pa.fpath%tuple(pa.filename.split('/')), |
223 pa.offset,pa.length,whole) | 226 pa.offset,pa.length,whole) |
224 else: | 227 else: |
225 print("Reading length, offset, filename tab-delimited triples from stdin...", | 228 print("Reading length, offset, filename tab-delimited triples from stdin...", |
226 file=sys.stderr) | 229 file=sys.stderr) |
227 for l in sys.stdin: | 230 for l in sys.stdin: |
229 (length,offset,filename)=l.rstrip().split('\t') | 232 (length,offset,filename)=l.rstrip().split('\t') |
230 length=int(length) | 233 length=int(length) |
231 offset=int(offset) | 234 offset=int(offset) |
232 except ValueError as e: | 235 except ValueError as e: |
233 parser.error('Invalid input line: %s\n "%s"'%(e,l)) | 236 parser.error('Invalid input line: %s\n "%s"'%(e,l)) |
234 process(pa,buf,FPAT%tuple(filename.split('/')), | 237 process(pa,buf,pa.fpath%tuple(filename.split('/')), |
235 offset,length,whole) | 238 offset,length,whole) |
236 | 239 |
237 if __name__ == "__main__": | 240 if __name__ == "__main__": |
238 main() | 241 main() |