Mercurial > hg > cc > cirrus_home
annotate bin/ix.py @ 121:863ea87be6bb
support field edit
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 28 Jun 2021 15:40:10 +0000 |
parents | bc958b776fb8 |
children | 5b0ec642ee9b |
rev | line source |
---|---|
105
baf56ff538f8
convert to rich directory structure per 2019-35
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
1 #!/usr/bin/env python3 |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Extract request records from Common Crawl WARC-format files |
105
baf56ff538f8
convert to rich directory structure per 2019-35
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
3 given length, offset and filename triples. |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 Input one triple on command line, or |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 triples from stdin as tab-delimited lines |
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 or complete cdx index lines. |
105
baf56ff538f8
convert to rich directory structure per 2019-35
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
7 In all cases by 'filename' is meant crawlid/segmentid/filename |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 |
98
1a4c5fdc2923
help format hacking done
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' |
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
119 | 11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex |
110 | 12 from isal import igzip |
119 | 13 from subprocess import Popen, PIPE |
108
9e5b117dc461
using Popen to run igzip (also not great)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
107
diff
changeset
|
14 #import asyncio |
97
2b880f2ce894
basic help format hacking works
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
15 |
98
1a4c5fdc2923
help format hacking done
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
# Pattern for the stretch of argparse's generated usage text that HackFormat
# rewrites.  Written as a raw string so that the regex escapes (\[, \s, \n)
# are interpreted by the regex engine rather than being (invalid) Python
# string escapes; the text matched is unchanged.
HACK_USAGE=regex.compile(r'\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
# Binary sink for extracted data.  process() rebinds this to a subprocess's
# stdin when a command is supplied.
BINOUT=sys.stdout.buffer
# Layout of a WARC file's path below the root: crawlid, segmentid, filename
FPAT="/%s/%s/orig/warc/%s"
94
d60073ec798a
just strugging with argparse
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 |
97
2b880f2ce894
basic help format hacking works
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
class HackFormat(argparse.RawDescriptionHelpFormatter):
    """Raw-description help formatter that rewrites part of the usage text.

    The portion of the generated help matched by HACK_USAGE is replaced by a
    hand-written alternative showing that -x and the length/offset/filename
    triple are alternatives.
    """

    def format_help(self):
        """Return the standard help text with the HACK_USAGE match replaced.

        The unmodified help text is also stored in the module-level FOO.
        """
        global FOO
        FOO = super().format_help()
        patched = HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', FOO)
        return patched
100 | 26 |
def process(options,buf,filename,offset,length,whole):
    """Extract one record via process1, optionally piping it through a command.

    If options.cmd is set, it is split with shlex and started as a subprocess
    whose stdin becomes the module-level BINOUT for the duration of the
    extraction; the process object is kept in the module-level CMD_PROC.
    A non-zero exit status of the subprocess is reported on stderr.

    options  -- parsed arguments; .cmd is read here, the rest by process1
    buf      -- scratch bytearray handed on to process1
    filename -- path of the WARC file relative to the root
    offset   -- byte offset of the gzipped record in that file
    length   -- length in bytes of the gzipped record
    whole    -- true if the whole record, not selected parts, is wanted
    """
    global CMD_PROC, BINOUT
    if not options.cmd:
        process1(options,buf,filename,offset,length,whole)
        return
    CMD_PROC=Popen(shlex.split(options.cmd),stdin=PIPE,bufsize=0)
    BINOUT=CMD_PROC.stdin
    try:
        process1(options,buf,filename,offset,length,whole)
    finally:
        # Wind up subproc, even if extraction raised: otherwise the pipe
        # stays open and the child is never reaped
        BINOUT.close()
        if CMD_PROC.wait()!=0: # could/should be async?
            print("subproc of %s:%s:%s failed with %s"%(length,offset,filename,
                                                        CMD_PROC.returncode),
                  file=sys.stderr)
40 | |
41 def _output_stdout(buf): | |
42 BINOUT.write(buf) | |
43 | |
44 def _output_subproc(buf): | |
45 toWrite=len(buf) | |
46 while toWrite>0: | |
47 toWrite -= BINOUT.write(buf) | |
48 | |
def _ensure_cached(root,filename,buf):
    """Return the path of filename below root, copying the file there if needed.

    When root is not /beegfs/common_crawl it is treated as a faster cache
    (ramdisk or other local disk): a missing file is copied from
    /beegfs/common_crawl using buf as the transfer buffer.  The copy is
    written under a temporary name and renamed into place, so an interrupted
    copy is never mistaken for a complete cached file.
    """
    rfn=root+filename
    if root=="/beegfs/common_crawl" or os.path.exists(rfn):
        return rfn
    os.makedirs(os.path.dirname(rfn),exist_ok=True)
    tmp="%s.%d.part"%(rfn,os.getpid())
    try:
        with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \
             io.FileIO(tmp,'w') as outfile:
            while True:
                l=infile.readinto(buf)
                if l==0:
                    break
                outfile.write(memoryview(buf)[:l])
        os.replace(tmp,rfn)
    finally:
        # Only still present if the copy did not complete
        if os.path.exists(tmp):
            os.remove(tmp)
    return rfn

def process1(options,buf,filename,offset,length,whole):
    """Extract one gzipped WARC record and pass the wanted parts to _output.

    options  -- parsed arguments; reads .root, .zipped, .warc, .headers, .body
    buf      -- scratch bytearray: the first length bytes receive the
                compressed record, the remainder the decompressed text
    filename -- path of the WARC file relative to options.root
    offset   -- byte offset of the gzipped record in that file
    length   -- length in bytes of the gzipped record
    whole    -- true if no part flags were given: output the entire record,
                still compressed if options.zipped is set

    Problems (short read, decompression buffer too small, HTTP/WARC length
    mismatch) are reported on stderr.
    """
    rfn=_ensure_cached(options.root,filename,buf)
    bv=memoryview(buf)[:length]
    with open(rfn,'rb',0) as file:
        file.seek(offset)
        nb=file.readinto(bv)
    if nb!=length:
        print("losing",rfn,length,nb,file=sys.stderr)
    if whole and options.zipped:
        _output(bv)
        return
    gzip_chunk = io.BytesIO(bv)
    uv=memoryview(buf)[length:]
    ll=0
    with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
        while ll<len(uv):
            # Append after what has already been decompressed, rather than
            # overwriting it
            l=gzip_fin.readinto(uv[ll:])
            if not l:
                break
            ll+=l
        if ll==len(uv) and gzip_fin.read(1):
            print("buffer too small, record truncated:",length,offset,filename,
                  file=sys.stderr)
    cb=memoryview(uv)[:ll]
    if whole:
        _output(cb)
        return
    # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
    state=0
    tr=None # Was this record truncated?
    bl=None # for HTTP Content-Length for the length of the body?
    with io.BytesIO(cb) as clear_text:
        for L in clear_text:
            if state==0:
                # WARC header
                if L.startswith(b"Content-Length: "):
                    wl=int(L[16:].rstrip())
                elif L.startswith(b"WARC-Truncated: "):
                    tr=L[16:].rstrip()
                    # tr is bytes, so compare with a bytes literal
                    tr="EMPTY" if tr==b"" else tr
                elif L==b"" or L.startswith(b"\r"): # for idempotency
                    # Blank line, WARC header is finished
                    if not (options.headers or options.body):
                        return
                    state=1
                # Note we preserve the empty line
                if options.warc:
                    _output(L)
                continue
            if state==1:
                # HTTP header
                # wl ends up as the number of bytes left for the body
                wl -= len(L)
                if not (L==b"" or L.startswith(b"\r")):
                    # Non-blank, it's a header
                    if bl is None and L.startswith(b"Content-Length: "):
                        try:
                            bl=int(L[16:].rstrip())
                        except ValueError:
                            # Malformed value from the server: leave bl unset,
                            # which just skips the length sanity check below
                            pass
                    if options.headers:
                        _output(L)
                else:
                    # Blank line, HTTP header is finished
                    if not options.body:
                        return
                    if options.headers:
                        _output(L)
                    state=2
                    # The above is just for sanity, because we do _not_
                    #  continue with the outer loop,
                    #  since we can now block-output the entire rest of the
                    #  input buffer.
                    if bl is not None:
                        if bl!=wl:
                            print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
                                  (length,offset,filename,wl,bl,tr),file=sys.stderr)
                    # HTTP body
                    balance=clear_text.tell()
                    # Output whatever is left
                    _output(cb[balance:balance+wl])
                    return
100 | 140 |
def _local_path(filename):
    """Turn crawlid/segmentid/filename into its path below the root via FPAT.

    Raises ValueError if filename does not have exactly three
    '/'-separated components.
    """
    parts=filename.split('/')
    if len(parts)!=3:
        raise ValueError("expected crawlid/segmentid/filename, got \"%s\""%filename)
    return FPAT%tuple(parts)

def main():
    """Parse the command line and extract the requested record(s).

    Triples of length, offset and filename come from one of three places:
    the command line, cdx index lines on stdin (-x), or tab-delimited lines
    on stdin.  Each triple is handed to process().  The module-level _output
    is bound to the subprocess or stdout writer depending on --cmd.
    """
    global _output
    parser = argparse.ArgumentParser(
        description='''Extract records from warc files given length, offset and file triples.
Input one triple on command line, or
triples from stdin as tab-delimited lines
or complete cdx index lines.
In all cases by 'filename' is meant crawlid/segmentid/filename''',
        epilog='''Note that if no output flag(s) is/are given,
the whole WARC record will be output, more efficiently than
would be the case if all three flags were given.''',
        add_help=False,
        conflict_handler='resolve',
        formatter_class=HackFormat
        )

    # -h is wanted for --headers, so help is only available as --help
    parser.add_argument('--help',help='Show help',action='help')
    parser.add_argument('-d','--debug',help='Debug output',action='store_true')
    parser.add_argument('-w','--warc',help='output WARC headers',
                        action='store_true')
    parser.add_argument('-h','--headers',help='output HTTP headers',
                        action='store_true')
    parser.add_argument('-b','--body',help='output HTTP body',
                        action='store_true')
    parser.add_argument('-c','--cmd',help='pipes each result thru CMD')
    parser.add_argument('-r','--root',nargs='?',
                        help='File path root, create a copy there if necessary',
                        default='/beegfs/common_crawl')
    parser.add_argument('-z','--zipped',
                        help="output raw gzipped record, ignored if any of -bhw supplied",
                        action='store_true')
    sg=parser.add_mutually_exclusive_group()
    sg.add_argument('-x','--index',
                    help='take lines of triples from a cdx index file as input',
                    action='store_true')
    sg.add_argument('length',type=int,
                    help='length in bytes of gzipped record',
                    nargs='?')
    parser.add_argument('offset',type=int,
                        help='start position in bytes of gzipped record',
                        nargs='?')
    parser.add_argument('filename',
                        help='pathname of gzipped Common Crawl WARC-format file',
                        nargs='?')
    # Hack the order of optional and positional in the help output
    parser._action_groups.sort(key=lambda g:g.title)
    pa=parser.parse_args(sys.argv[1:])
    if pa.length is not None:
        # We have to enforce our own check..
        if pa.offset is None or pa.filename is None:
            parser.error("length, offset and filename must all be supplied together")

    # Shared scratch buffer, reused for every record
    buf=bytearray(128*1024*1024)

    whole=not (pa.warc or pa.headers or pa.body)
    if pa.cmd:
        _output = _output_subproc
    else:
        _output = _output_stdout
    if pa.index:
        CDX=regex.compile(r'length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
        for l in sys.stdin:
            m=CDX.search(l)
            if m is None:
                print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
                sys.exit(2)
            # Groups 3-5 are crawlid, segmentid and filename
            f=FPAT%(m[3:6])
            process(pa,buf,f,
                    int(m[2]),int(m[1]),whole)
    elif pa.length is not None:
        print(pa.filename,file=sys.stderr)
        try:
            f=_local_path(pa.filename)
        except ValueError as e:
            parser.error(str(e))
        process(pa,buf,f,
                pa.offset,pa.length,whole)
    else:
        print("Reading length, offset, filename tab-delimited triples from stdin...",
              file=sys.stderr)
        for l in sys.stdin:
            try:
                (length,offset,filename)=l.rstrip().split('\t')
                length=int(length)
                offset=int(offset)
                f=_local_path(filename)
            except ValueError as e:
                parser.error('Invalid input line: %s\n "%s"'%(e,l))
            process(pa,buf,f,
                    offset,length,whole)

if __name__ == "__main__":
    main()