comparison bin/ix.py @ 105:baf56ff538f8

convert to rich directory structure per 2019-35
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 20 Apr 2021 11:12:35 +0000
parents 61122560ae0c
children 815b33c3254a
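comparison (side by side: old revision in the left column, new revision in the right column)
legend: equal | deleted | inserted | replaced
104:61122560ae0c (old) 105:baf56ff538f8 (new)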
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2 '''Extract request records from Common Crawl WARC-format files 2 '''Extract request records from Common Crawl WARC-format files
3 given length, offset and file triples. 3 given length, offset and filename triples.
4 Input one triple on command line, or 4 Input one triple on command line, or
5 triples from stdin as tab-delimited lines 5 triples from stdin as tab-delimited lines
6 or complete cdx index lines. 6 or complete cdx index lines.
7 In all cases by 'filename' is meant crawlid/segmentid/filename
7 8
8 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' 9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
9 10
10 import sys, argparse, regex 11 import sys, argparse, regex
11 12
12 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') 13 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
14 FPAT="/beegfs/common_crawl/%s/%s/orig/warc/%s"
15 BINOUT=sys.stdout.buffer
13 16
14 class HackFormat(argparse.RawDescriptionHelpFormatter): 17 class HackFormat(argparse.RawDescriptionHelpFormatter):
15 def format_help(self): 18 def format_help(self):
16 global FOO 19 global FOO
17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) 20 FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
18 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', 21 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]',
19 FOO) 22 FOO)
20 23
21 def process(options,buf,file,offset,length,whole): 24 def process(options,buf,filename,offset,length,whole):
25 file=open(filename,'rb',0)
22 if whole: 26 if whole:
23 file.seek(offset) 27 file.seek(offset)
24 bv=memoryview(buf)[:length] 28 bv=memoryview(buf)[:length]
25 nb=file.readinto(bv) 29 nb=file.readinto(bv)
26 if nb!=length: 30 if nb!=length:
27 print("losing",file.name,length,nb,file=sys.stderr) 31 print("losing",file.name,length,nb,file=sys.stderr)
28 exit(1) 32 BINOUT.write(bv)
29 sys.stdout.buffer.write(bv)
30 file.close() 33 file.close()
31 34
32 def main(): 35 def main():
33 parser = argparse.ArgumentParser( 36 parser = argparse.ArgumentParser(
34 description='''Extract records from warc files given length, offset and file triples. 37 description='''Extract records from warc files given length, offset and file triples.
35 Input one triple on command line, or 38 Input one triple on command line, or
36 triples from stdin as tab-delimited lines 39 triples from stdin as tab-delimited lines
37 or complete cdx index lines.''', 40 or complete cdx index lines.
41 In all cases by 'filename' is meant crawlid/segmentid/filename''',
38 epilog='''Note that if no output flag(s) is/are given, 42 epilog='''Note that if no output flag(s) is/are given,
39 the whole WARC record will be output, more efficiently than 43 the whole WARC record will be output, more efficiently than
40 would be the case if all three flags were given.''', 44 would be the case if all three flags were given.''',
41 add_help=False, 45 add_help=False,
42 conflict_handler='resolve', 46 conflict_handler='resolve',
61 nargs='?') 65 nargs='?')
62 parser.add_argument('offset',type=int, 66 parser.add_argument('offset',type=int,
63 help='start position in bytes of gzipped record', 67 help='start position in bytes of gzipped record',
64 nargs='?') 68 nargs='?')
65 parser.add_argument('filename', 69 parser.add_argument('filename',
66 help='name of gzipped Common Crawl WARC-format file', 70 help='pathname of gzipped Common Crawl WARC-format file',
67 nargs='?', 71 nargs='?')
68 type=argparse.FileType('rb',0))
69 # Hack the order of optional and positional in the help output 72 # Hack the order of optional and positional in the help output
70 parser._action_groups.sort(key=lambda g:g.title) 73 parser._action_groups.sort(key=lambda g:g.title)
71 #parser.print_help() 74 #parser.print_help()
72 pa=parser.parse_args(sys.argv[1:]) 75 pa=parser.parse_args(sys.argv[1:])
73 #print(pa,file=sys.stderr) 76 print(pa,file=sys.stderr)
74 if pa.length is not None: 77 if pa.length is not None:
75 # We have to enforce our own check.. 78 # We have to enforce our own check..
76 if pa.offset is None or pa.filename is None: 79 if pa.offset is None or pa.filename is None:
77 parser.error("length, offset and filename must all be supplied together") 80 parser.error("length, offset and filename must all be supplied together")
78 81
79 buf=bytearray(1024*1024) 82 buf=bytearray(2024*1024)
80 83
81 whole=not (pa.warc or pa.headers or pa.body) 84 whole=not (pa.warc or pa.headers or pa.body)
82 if pa.length is not None: 85 if pa.length is not None:
83 process(pa,buf,pa.filename,pa.offset,pa.length,whole) 86 process(pa,buf,FPAT%tuple(pa.filename.split('/')),pa.offset,pa.length,whole)
84 exit(0) 87 exit(0)
85 if pa.index: 88 if pa.index:
86 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') 89 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
87 for l in sys.stdin: 90 for l in sys.stdin:
88 m=CDX.search(l) 91 m=CDX.search(l)
89 if m is None: 92 if m is None:
90 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) 93 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
91 exit(2) 94 exit(2)
92 f="%s/%s/%s"%(m[3],m[4],m[5]) 95 f=FPAT%(m[3:6])
93 process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole) 96 process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole)
94 exit(0) 97 exit(0)
95 if __name__ == "__main__": 98 if __name__ == "__main__":
96 main() 99 main()