comparison bin/ix.py @ 104:61122560ae0c

-x barely working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 19 Apr 2021 18:09:51 +0000
parents f7623dbd8eb5
children baf56ff538f8
comparison
equal deleted inserted replaced
103:d354e29c91a5 104:61122560ae0c
16 global FOO 16 global FOO
17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) 17 FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
18 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', 18 return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]',
19 FOO) 19 FOO)
20 20
21 def process(options,buf,file,offset,length): 21 def process(options,buf,file,offset,length,whole):
22 whole=not (options.warc or options.headers or options.body)
23 if whole: 22 if whole:
24 file.seek(offset) 23 file.seek(offset)
25 bv=memoryview(buf)[:length] 24 bv=memoryview(buf)[:length]
26 nb=file.readinto(bv) 25 nb=file.readinto(bv)
27 if nb!=length: 26 if nb!=length:
28 print("losing",file.name,name,length,nb,file=sys.stderr) 27 print("losing",file.name,length,nb,file=sys.stderr)
29 exit(1) 28 exit(1)
30 sys.stdout.buffer.write(bv) 29 sys.stdout.buffer.write(bv)
30 file.close()
31 31
32 def main(): 32 def main():
33 parser = argparse.ArgumentParser( 33 parser = argparse.ArgumentParser(
34 description='''Extract records from warc files given length, offset and file triples. 34 description='''Extract records from warc files given length, offset and file triples.
35 Input one triple on command line, or 35 Input one triple on command line, or
68 type=argparse.FileType('rb',0)) 68 type=argparse.FileType('rb',0))
69 # Hack the order of optional and positional in the help output 69 # Hack the order of optional and positional in the help output
70 parser._action_groups.sort(key=lambda g:g.title) 70 parser._action_groups.sort(key=lambda g:g.title)
71 #parser.print_help() 71 #parser.print_help()
72 pa=parser.parse_args(sys.argv[1:]) 72 pa=parser.parse_args(sys.argv[1:])
73 print(pa,file=sys.stderr) 73 #print(pa,file=sys.stderr)
74 if pa.length is not None: 74 if pa.length is not None:
75 # We have to enforce our own check.. 75 # We have to enforce our own check..
76 if pa.offset is None or pa.filename is None: 76 if pa.offset is None or pa.filename is None:
77 parser.error("length, offset and filename must all be supplied together") 77 parser.error("length, offset and filename must all be supplied together")
78 78
79 buf=bytearray(1024*1024) 79 buf=bytearray(1024*1024)
80 80
81 whole=not (pa.warc or pa.headers or pa.body)
81 if pa.length is not None: 82 if pa.length is not None:
82 process(pa,buf,pa.filename,pa.offset,pa.length) 83 process(pa,buf,pa.filename,pa.offset,pa.length,whole)
83 exit(0) 84 exit(0)
84 85 if pa.index:
86 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
87 for l in sys.stdin:
88 m=CDX.search(l)
89 if m is None:
90 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
91 exit(2)
92 f="%s/%s/%s"%(m[3],m[4],m[5])
93 process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole)
94 exit(0)
85 if __name__ == "__main__": 95 if __name__ == "__main__":
86 main() 96 main()