Mercurial > hg > cc > cirrus_home
annotate bin/ix.py @ 137:bb0153be65b5
add cl arg --fpath replacing FPAT, which is now default value
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 21 Jul 2021 20:05:42 +0000 |
parents | b51d65ed6c89 |
children |
rev | line source |
---|---|
105
baf56ff538f8
convert to rich directory structure per 2019-35
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
1 #!/usr/bin/env python3 |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Extract request records from Common Crawl WARC-format files |
105
baf56ff538f8
convert to rich directory structure per 2019-35
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
3 given length, offset and filename triples. |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 Input one triple on command line, or |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 triples from stdin as tab-delimited lines |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 or complete cdx index lines. |
137
bb0153be65b5
add cl arg --fpath replacing FPAT, which is now default value
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
129
diff
changeset
|
7 In all cases by 'filename' is meant crawlid/segmentid/type/filename |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 |
98
1a4c5fdc2923
help format hacking done
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
119 | 11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex |
110 | 12 from isal import igzip |
119 | 13 from subprocess import Popen, PIPE |
108
9e5b117dc461
using Popen to run igzip (also not great)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
107
diff
changeset
|
14 #import asyncio |
97
2b880f2ce894
basic help format hacking works
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
15 |
98
1a4c5fdc2923
help format hacking done
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
# Matches the usage fragment that argparse generates for the -x flag followed
#  by the three optional positionals, so that HackFormat can rewrite it.
# Raw string: '\[' and '\s' are not valid string escapes and draw a
#  SyntaxWarning from Python 3.12 on; the compiled pattern is unchanged.
HACK_USAGE=regex.compile(r'\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
# Binary sink for extracted data; process() rebinds it to a subprocess's
#  stdin when --cmd is in use.
BINOUT=sys.stdout.buffer
# Default for --fpath: turns the 4 filename components into a path
FPAT="/%s/%s/orig/%s/%s"
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 |
97
2b880f2ce894
basic help format hacking works
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
class HackFormat(argparse.RawDescriptionHelpFormatter):
    """Raw-description help formatter with a hand-tuned usage line.

    The -x flag and the length/offset/filename positionals are alternatives,
    which argparse's generated usage does not show, so the fragment matched
    by HACK_USAGE is replaced with an explicit "( -x | ... )" form.
    """

    def format_help(self):
        """Return the inherited help text with the usage fragment rewritten."""
        # A local is enough here, no need to park the text in a module global
        formatted=super().format_help()
        return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]',
                              formatted)
100 | 26 |
def process(options,buf,filename,offset,length,whole):
    """Extract one record, piping the output through options.cmd if given.

    Without options.cmd this just calls process1 with the same arguments.
    With it, a fresh subprocess is started for this record, BINOUT is
    rebound to its (unbuffered) stdin, and once process1 is finished the
    pipe is closed and the subprocess waited for.  A non-zero exit status
    is reported on stderr but is not treated as an error.

    Any exception from process1 propagates to the caller.
    """
    global CMD_PROC, BINOUT
    if not options.cmd:
        process1(options,buf,filename,offset,length,whole)
        return
    CMD_PROC=Popen(shlex.split(options.cmd),stdin=PIPE,bufsize=0)
    BINOUT=CMD_PROC.stdin
    try:
        process1(options,buf,filename,offset,length,whole)
    finally:
        # Wind up subproc, even if process1 failed, so that we don't
        #  leave an open pipe and an unreaped child behind
        BINOUT.close()
        if CMD_PROC.wait()!=0:  # could/should be async?
            print("subproc of %s:%s:%s failed with %s"%(length,offset,filename,
                                                         CMD_PROC.returncode),
                  file=sys.stderr)
40 | |
41 def _output_stdout(buf): | |
42 BINOUT.write(buf) | |
43 | |
44 def _output_subproc(buf): | |
45 toWrite=len(buf) | |
46 while toWrite>0: | |
47 toWrite -= BINOUT.write(buf) | |
48 | |
def process1(options,buf,filename,offset,length,whole):
    """Read one gzipped WARC record and emit the wanted parts via _output.

    options -- parsed command line; .root, .zipped, .warc, .headers and
               .body are consulted here
    buf -- large reusable bytearray: the compressed record is read into
           its first length bytes and unzipped into the remainder
    filename -- path of the WARC file, appended directly to options.root
    offset, length -- position and size in bytes of the gzipped record
    whole -- true if the complete record is wanted, not selected parts

    If options.root is not the master copy's root, the file is first
    copied from the master to the corresponding place under options.root
    unless it is already there.

    Raises ValueError if fewer than length bytes can be read at offset,
    if the unzipped record does not fit in the rest of buf, or if parts
    beyond the WARC header are wanted but it has no Content-Length.
    """
    master="/beegfs/common_crawl"
    root=options.root
    rfn=root+filename
    if root!=master and not os.path.exists(rfn):
        # Support using ramdisk or other local disk as a faster cache:
        #  copy the whole file across the first time it is asked for
        os.makedirs(os.path.dirname(rfn),exist_ok=True)
        with io.FileIO(master+filename,'r') as infile, \
             io.FileIO(rfn,'w') as outfile:
            while True:
                n=infile.readinto(buf)
                if n==0:
                    break
                outfile.write(memoryview(buf)[:n])
    bv=memoryview(buf)[:length]
    # Unbuffered, we want exactly one chunk; 'with' closes it on any failure
    with open(rfn,'rb',0) as chunk_file:
        chunk_file.seek(offset)
        nb=chunk_file.readinto(bv)
    if nb!=length:
        raise ValueError("Chunk read losing: %s, got %s expected %s at %s"%(rfn,
                                                          nb,length,offset))
    if whole and options.zipped:
        # Raw gzipped record wanted, no need to unzip at all
        _output(bv)
        return
    uv=memoryview(buf)[length:]
    ll=0
    with igzip.IGzipFile(fileobj=io.BytesIO(bv)) as gzip_fin:
        while True:
            # Always read into the unused tail, so that a short read can't
            #  cause what we already have to be overwritten
            room=uv[ll:]
            if len(room)==0:
                if gzip_fin.read(1):
                    raise ValueError("Unzipped record too big for buffer: %s at %s in %s"%(length,
                                                          offset,rfn))
                break
            n=gzip_fin.readinto(room)
            if not n:
                break
            ll+=n
    cb=uv[:ll]
    if whole:
        _output(cb)
        return
    # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
    state=0
    wl=None # WARC Content-Length, counted down to what's left for the body
    tr=None # Was this record truncated?
    bl=None # for HTTP Content-Length for the length of the body?
    with io.BytesIO(cb) as clear_text:
        for L in clear_text:
            blank=(L==b"" or L.startswith(b"\r")) # for idempotency
            if state==0:
                # WARC header
                if L.startswith(b"Content-Length: "):
                    wl=int(L[16:].rstrip())
                elif L.startswith(b"WARC-Truncated: "):
                    tr=L[16:].rstrip()
                    # tr is bytes, so it must be compared with b""
                    tr="EMPTY" if tr==b"" else tr
                elif blank:
                    # Blank line, WARC header is finished
                    if not (options.headers or options.body):
                        return
                    if wl is None:
                        raise ValueError("No WARC Content-Length: %s %s %s"%(length,
                                                          offset,filename))
                    state=1
                # Note we preserve the empty line
                if options.warc:
                    _output(L)
                continue
            if state==1:
                # HTTP header
                wl-=len(L)
                if not blank:
                    # Non-blank, it's a header
                    if bl is None and L.startswith(b"Content-Length: "):
                        bl=int(L[16:].rstrip())
                    if options.headers:
                        _output(L)
                else:
                    # Blank line, HTTP header is finished
                    if not options.body:
                        return
                    if options.headers:
                        _output(L)
                    state=2
                    # The above is just for sanity, because we do _not_
                    #  continue with the outer loop,
                    #  since we can now block-output the entire rest of the
                    #  input buffer.
                    if bl is not None and bl!=wl:
                        print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
                              (length,offset,filename,wl,bl,tr),file=sys.stderr)
                    # HTTP body
                    balance=clear_text.tell()
                    # Output whatever is left
                    _output(cb[balance:balance+wl])
                    return
100 | 141 |
def main():
    """Parse the command line and extract the record(s) asked for.

    Triples come from one of three places: the length, offset and filename
    positionals; cdx index lines on stdin (-x); or, if neither is given,
    tab-delimited length/offset/filename lines on stdin.  Each one is
    handed to process().  The module-level _output is bound here to the
    writer appropriate to whether --cmd was given.

    Exits with status 2 for an unusable index line and 3 if processing an
    index line fails; malformed tab-delimited lines go to parser.error.
    """
    global _output
    parser = argparse.ArgumentParser(
        description='''Extract records from warc files given length, offset and file triples.
Input one triple on command line, or
triples from stdin as tab-delimited lines
or complete cdx index lines.
In all cases by 'filename' is meant crawlid/segmentid/type/filename''',
        epilog='''Note that if no output flag(s) is/are given,
the whole WARC record will be output, more efficiently than
would be the case if all three flags were given.''',
        add_help=False,
        conflict_handler='resolve',
        formatter_class=HackFormat
        )
    # argparse %-formats help strings, so the %s's have to be doubled
    fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s')
    parser.add_argument('--help',help='Show help',action='help')
    parser.add_argument('-d','--debug',help='Debug output',action='store_true')
    parser.add_argument('-w','--warc',help='output WARC headers',
                        action='store_true')
    parser.add_argument('-h','--headers',help='output HTTP headers',
                        action='store_true')
    parser.add_argument('-b','--body',help='output HTTP body',
                        action='store_true')
    parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
    parser.add_argument('-f','--fpath',
                        help=fphelp,
                        default=FPAT)
    parser.add_argument('-r','--root',nargs='?',
                        help='File path root, create a copy there if necessary',
                        default='/beegfs/common_crawl')
    parser.add_argument('-z','--zipped',
                        help="output raw gzipped record, ignored if any of -bhw supplied",
                        action='store_true')
    sg=parser.add_mutually_exclusive_group()
    sg.add_argument('-x','--index',
                    help='take lines of triples from a cdx index file as input',
                    action='store_true')
    sg.add_argument('length',type=int,
                    help='length in bytes of gzipped record',
                    nargs='?')
    parser.add_argument('offset',type=int,
                        help='start position in bytes of gzipped record',
                        nargs='?')
    parser.add_argument('filename',
                        help='pathname of gzipped Common Crawl WARC-format file',
                        nargs='?')
    # Hack the order of optional and positional in the help output
    parser._action_groups.sort(key=lambda g:g.title)
    pa=parser.parse_args(sys.argv[1:])
    if pa.length is not None:
        # We have to enforce our own check..
        if pa.offset is None or pa.filename is None:
            parser.error("length, offset and filename must all be supplied together")

    # One big buffer, shared by every record we process
    buf=bytearray(128*1024*1024)

    whole=not (pa.warc or pa.headers or pa.body)
    _output = _output_subproc if pa.cmd else _output_stdout
    if pa.index:
        # Raw string, to avoid an invalid-escape warning for '\.'
        CDX=regex.compile(r'length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
        for l in sys.stdin:
            m=CDX.search(l)
            if m is None:
                if l.find('/robotstxt/')>-1:
                    # silently skip robotstxt
                    continue
                print("index line problem: \"%s\""%l,file=sys.stderr,end='')
                # sys.exit, not the interactive-use exit() that site provides
                sys.exit(2)
            f=pa.fpath%(m[3:7])
            try:
                process(pa,buf,f,
                        int(m[2]),int(m[1]),whole)
            except Exception as e:
                print("Process fail: %s, input line:\n %s"%(e,l),
                      file=sys.stderr,end='')
                sys.exit(3)
    elif pa.length is not None:
        print(pa.filename,file=sys.stderr)
        process(pa,buf,pa.fpath%tuple(pa.filename.split('/')),
                pa.offset,pa.length,whole)
    else:
        print("Reading length, offset, filename tab-delimited triples from stdin...",
              file=sys.stderr)
        for l in sys.stdin:
            try:
                (length,offset,filename)=l.rstrip().split('\t')
                length=int(length)
                offset=int(offset)
            except ValueError as e:
                parser.error('Invalid input line: %s\n "%s"'%(e,l))
            process(pa,buf,pa.fpath%tuple(filename.split('/')),
                    offset,length,whole)
239 | |
# Only run when executed as a script, so that importing has no side effects
if __name__ == "__main__":
    main()