Mercurial > hg > cc > cirrus_home
annotate bin/ix.py @ 104:61122560ae0c
-x barely working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 19 Apr 2021 18:09:51 +0000 |
parents | f7623dbd8eb5 |
children | baf56ff538f8 |
rev | line source |
---|---|
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Extract request records from Common Crawl WARC-format files |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 given length, offset and file triples. |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 Input one triple on command line, or |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 triples from stdin as tab-delimited lines |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 or complete cdx index lines. |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 |
98
1a4c5fdc2923
help format hacking done
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
8 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 |
97
2b880f2ce894
basic help format hacking works
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
10 import sys, argparse, regex |
2b880f2ce894
basic help format hacking works
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
11 |
98
1a4c5fdc2923
help format hacking done
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
# Matches the usage fragment that argparse emits for the -x flag followed by
# the three optional positionals; HackFormat.format_help rewrites that
# fragment.  A raw string keeps the regex escapes (\[, \], \s) out of Python's
# own string-escape processing; \n is then interpreted by the regex engine and
# still matches a newline.
HACK_USAGE=regex.compile(r'\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 |
97
2b880f2ce894
basic help format hacking works
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
class HackFormat(argparse.RawDescriptionHelpFormatter):
    '''Help formatter that rewrites the usage fragment for -x and the triple.

    The stock formatter lists -x and the length/offset/filename positionals
    as independent optional items; this subclass post-processes the finished
    help text so they are shown as a single either/or choice.
    '''

    def format_help(self):
        '''Return the formatted help with the HACK_USAGE fragment replaced.'''
        # Keep the intermediate text local: nothing outside this method
        # needs it, so it should not be published as a module global.
        help_text=super().format_help()
        return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]',
                              help_text)
100 | 20 |
def process(options,buf,file,offset,length,whole):
    '''Write the `length` bytes found at `offset` in `file` to stdout.

    Args:
        options: parsed command-line options; not consulted by this function.
        buf: reusable bytearray used as the read buffer when the record
            fits in it.
        file: binary file object positioned with seek(); it is always
            closed before this function returns or exits.
        offset: byte position in `file` at which the record starts.
        length: number of bytes to copy.
        whole: when true the bytes are copied unchanged to
            sys.stdout.buffer; when false nothing is written.

    Raises:
        SystemExit: with status 1 if fewer than `length` bytes could be read.
    '''
    try:
        if not whole:
            # Only whole-record output is handled here.
            return
        # A record may be larger than the shared buffer: slicing a too-small
        # buffer would silently truncate the view and the length check would
        # then always fail, so fall back to a one-off buffer of the right size.
        scratch=buf if length<=len(buf) else bytearray(length)
        file.seek(offset)
        view=memoryview(scratch)[:length]
        # readinto() on an unbuffered file may return fewer bytes than asked
        # for, so keep reading until the view is full or the data runs out.
        filled=0
        while filled<length:
            got=file.readinto(view[filled:])
            if not got:
                break
            filled+=got
        if filled!=length:
            print("losing",getattr(file,"name","<stream>"),length,filled,
                  file=sys.stderr)
            sys.exit(1)
        sys.stdout.buffer.write(view)
    finally:
        file.close()
100 | 31 |
def _build_parser():
    '''Build the argparse parser for the command line of this script.'''
    parser = argparse.ArgumentParser(
        description='''Extract records from warc files given length, offset and file triples.
Input one triple on command line, or
triples from stdin as tab-delimited lines
or complete cdx index lines.''',
        epilog='''Note that if no output flag(s) is/are given,
the whole WARC record will be output, more efficiently than
would be the case if all three flags were given.''',
        add_help=False,
        conflict_handler='resolve',
        formatter_class=HackFormat
        )

    # add_help is off so that -h can be used for --headers; help is
    # available as --help only.
    parser.add_argument('--help',help='Show help',action='help')
    parser.add_argument('-d','--debug',help='Debug output',action='store_true')
    parser.add_argument('-w','--warc',help='output WARC headers',
                        action='store_true')
    parser.add_argument('-h','--headers',help='output HTTP headers',
                        action='store_true')
    parser.add_argument('-b','--body',help='output HTTP body',
                        action='store_true')
    parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
    # -x and an explicit triple are alternatives; only the first positional
    # can live in the exclusive group, the rest are checked by main().
    sg=parser.add_mutually_exclusive_group()
    sg.add_argument('-x','--index',
                    help='take lines of triples from a cdx index file as input',
                    action='store_true')
    sg.add_argument('length',type=int,
                    help='length in bytes of gzipped record',
                    nargs='?')
    parser.add_argument('offset',type=int,
                        help='start position in bytes of gzipped record',
                        nargs='?')
    parser.add_argument('filename',
                        help='name of gzipped Common Crawl WARC-format file',
                        nargs='?',
                        type=argparse.FileType('rb',0))
    # Hack the order of optional and positional in the help output
    parser._action_groups.sort(key=lambda g:g.title)
    return parser


def main():
    '''Parse the command line and copy the requested record(s) to stdout.

    Three input modes are supported: a single length/offset/filename triple
    on the command line, cdx index lines on stdin (with -x), or
    tab-delimited length/offset/filename lines on stdin (the default when
    neither of the others is given).

    Raises:
        SystemExit: status 0 on success, 2 for an unparseable input line or
            a command-line error; process() exits with 1 on a short read.
    '''
    parser=_build_parser()
    pa=parser.parse_args(sys.argv[1:])
    if pa.length is not None:
        # We have to enforce our own check..
        if pa.offset is None or pa.filename is None:
            parser.error("length, offset and filename must all be supplied together")

    buf=bytearray(1024*1024)

    # With no output flag the record is copied through unchanged.
    whole=not (pa.warc or pa.headers or pa.body)
    if pa.length is not None:
        process(pa,buf,pa.filename,pa.offset,pa.length,whole)
        sys.exit(0)
    if pa.index:
        CDX=regex.compile(r'length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
        for l in sys.stdin:
            m=CDX.search(l)
            if m is None:
                print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
                sys.exit(2)
            # The local path is built from the crawl, segment and file
            # name components of the indexed filename.
            f="%s/%s/%s"%(m[3],m[4],m[5])
            process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole)
        sys.exit(0)
    # Neither a triple nor -x: read tab-delimited length, offset, filename
    # lines from stdin, as the help text describes.
    for l in sys.stdin:
        try:
            length_text,offset_text,name=l.rstrip('\r\n').split('\t')
            length=int(length_text)
            offset=int(offset_text)
        except ValueError:
            print("triple line problem: \"%s\""%l.lstrip(),file=sys.stderr)
            sys.exit(2)
        process(pa,buf,open(name,'rb',0),offset,length,whole)
    sys.exit(0)
# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()