Mercurial > hg > cc > cirrus_work
annotate bin/ix.py @ 18:046dbe557911
write to tmp file implemented
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 07 Aug 2022 13:58:33 +0100 |
parents | |
children | cbac7dfe2f24 |
rev | line source |
---|---|
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Extract request records from Common Crawl WARC-format files |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 given length, offset and filename triples. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 Input one triple on command line, or |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 triples from stdin as tab-delimited lines |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 or complete cdx index lines. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 In all cases by 'filename' is meant crawlid/segmentid/type/filename |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 from isal import igzip |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 from subprocess import Popen, PIPE |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 #import asyncio |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Matches the "[-x] <newline> [length] [offset] [filename]" fragment of the
# usage text that argparse generates, so HackFormat can rewrite it.
# Raw string: "\[" and "\s" are regex escapes, not Python string escapes
# (non-raw use triggers an invalid-escape warning on recent Pythons).
HACK_USAGE = regex.compile(r'\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
# Binary sink for extracted data; process() rebinds it to a subprocess's
# stdin while a --cmd pipeline is active.
BINOUT = sys.stdout.buffer
# Default value of the --fpath option.
FPAT = "/%s/%s/orig/%s/%s"

# Popen object for the --cmd subprocess most recently started by process().
CMD_PROC = None
# Name of the temporary file currently in use under --save, else None.
TMPFILENAME = None
# Open binary handle on that temporary file (set by process()).
TMPFILE = None
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
class HackFormat(argparse.RawDescriptionHelpFormatter):
    """Help formatter that rewrites the usage line for the input arguments.

    The stock help text is generated first; the fragment matched by
    HACK_USAGE (the separate [-x] / [length] [offset] [filename] items) is
    then replaced by a single grouped alternative.
    """

    def format_help(self):
        """Return the standard help text with the usage fragment rewritten."""
        stock_help = super().format_help()
        grouped = '\n [ ( -x | length offset filename ) ]'
        return HACK_USAGE.sub(grouped, stock_help)
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def process(options, buf, filename, offset, length, whole):
    """Extract one record and route it to the configured destination.

    Sets up the per-record sinks, delegates the extraction to process1(),
    then winds the sinks up again:

    * options.save: the record is written to a fresh temporary file and the
      file's name (plus a newline) is emitted instead of the data.
    * options.cmd: a subprocess is started from the shlex-split command
      line and BINOUT is rebound to its stdin; after the record (or the
      temp file name) has been sent, stdin is closed and the exit status
      is checked, with a diagnostic on stderr if it is non-zero.

    With both options the temp file is deleted once the subprocess has
    finished; with options.save alone it is left for the consumer and a
    reminder is printed on stderr.

    If anything fails part-way, the subprocess is reaped and the temporary
    file is closed and removed before the exception is re-raised.

    buf, filename, offset, length and whole are passed through unchanged
    to process1().
    """
    global CMD_PROC, BINOUT, TMPFILENAME, TMPFILE
    if options.save:
        # Imported locally so this function does not depend on some other
        # function having published the module as a global beforehand.
        import tempfile
        (fd, TMPFILENAME) = tempfile.mkstemp()
        TMPFILE = open(fd, mode='wb')
    proc = None
    try:
        if options.cmd:
            proc = Popen(shlex.split(options.cmd), stdin=PIPE, bufsize=0)
            CMD_PROC = proc
            BINOUT = proc.stdin
        process1(options, buf, filename, offset, length, whole)
        if options.save:
            TMPFILE.close()
            # The consumer gets the file name rather than the data itself
            emit = _output_subproc if options.cmd else BINOUT.write
            emit(bytes(TMPFILENAME, 'utf-8'))
            emit(b"\n")
    except Exception:
        # Don't leave a subprocess or a half-written temp file behind
        if proc is not None:
            try:
                proc.stdin.close()
            except OSError:
                pass  # already failing; keep the original error
            proc.wait()
        if options.save:
            TMPFILE.close()
            os.unlink(TMPFILENAME)
            TMPFILENAME = None
        raise
    if options.cmd:
        # Wind up subproc
        BINOUT.close()
        if CMD_PROC.wait() != 0:  # could/should be async?
            print("subproc of %s:%s:%s failed with %s" % (length, offset, filename,
                                                          CMD_PROC.returncode),
                  file=sys.stderr)
        if options.save:
            # not if async?
            os.unlink(TMPFILENAME)
            TMPFILENAME = None
    elif options.save:
        print("%s will need to be deleted" % TMPFILENAME, file=sys.stderr)
        TMPFILENAME = None
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
60 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
61 def _output_tmpfile(buf): |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
62 TMPFILE.write(buf) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
63 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
64 def _output_stdout(buf): |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
65 BINOUT.write(buf) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
66 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
67 def _output_subproc(buf): |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
68 toWrite=len(buf) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
69 while toWrite>0: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
70 toWrite -= BINOUT.write(buf) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
71 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def process1(options, buf, filename, offset, length, whole):
    """Read one gzipped WARC record and pass the wanted parts to _output.

    options  -- parsed arguments; the attributes read here are root,
                zipped, warc, headers and body
    buf      -- bytearray scratch buffer; the first `length` bytes receive
                the compressed record and the remainder the decompressed
                text, so it must be large enough for both
    filename -- path of the WARC file, appended to options.root
    offset   -- byte position of the record within that file
    length   -- size in bytes of the compressed record
    whole    -- true when no part selection was requested: the complete
                record is output, still compressed if options.zipped

    Raises ValueError if fewer than `length` bytes could be read, or if
    the HTTP part is wanted but the WARC header has no Content-Length.
    """
    root = options.root
    rfn = root + filename
    if root != "/beegfs/common_crawl":
        # A different root (ramdisk or other local disk) acts as a faster
        # cache: copy the file there the first time it is needed.
        if not os.path.exists(rfn):
            os.makedirs(os.path.dirname(rfn), exist_ok=True)
            with io.FileIO('/beegfs/common_crawl' + filename, 'r') as infile, \
                 io.FileIO(rfn, 'w') as outfile:
                while True:
                    n = infile.readinto(buf)
                    if n == 0:
                        break
                    outfile.write(memoryview(buf)[:n])
    # Unbuffered read of exactly the compressed record into the front of buf
    with open(rfn, 'rb', 0) as src:
        src.seek(offset)
        bv = memoryview(buf)[:length]
        nb = src.readinto(bv)
    if nb != length:
        raise ValueError("Chunk read losing: %s, got %s expected %s at %s" % (src.name,
                                                                             nb, length, offset))
    if whole and options.zipped:
        _output(bv)
        return
    gzip_chunk = io.BytesIO(bv)
    # Decompress into the part of buf behind the compressed bytes
    uv = memoryview(buf)[length:]
    with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
        ll = 0
        while True:
            # Continue after what has been read so far, so that a short
            # read cannot overwrite earlier output
            n = gzip_fin.readinto(uv[ll:])
            if not n:
                break
            ll += n
    cb = uv[:ll]
    if whole:
        _output(cb)
        return
    # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
    state = 0
    tr = None  # Was this record truncated?
    bl = None  # HTTP Content-Length, if the HTTP header gives one
    wl = None  # WARC Content-Length, counted down as HTTP header lines are consumed
    with io.BytesIO(cb) as clear_text:
        for L in clear_text:
            blank = L == b"" or L.startswith(b"\r")
            if state == 0:
                # WARC header
                if L.startswith(b"Content-Length: "):
                    wl = int(L[16:].rstrip())
                elif L.startswith(b"WARC-Truncated: "):
                    # L is bytes, so an empty value must be tested as bytes
                    tr = L[16:].rstrip() or "EMPTY"
                elif blank:
                    # Blank line, WARC header is finished
                    if not (options.headers or options.body):
                        return
                    if wl is None:
                        raise ValueError("no WARC Content-Length header: %s %s %s" %
                                         (length, offset, filename))
                    state = 1
                    # Note we preserve the empty line
                if options.warc:
                    _output(L)
                continue
            # HTTP header
            wl -= len(L)
            if not blank:
                # Non-blank, it's a header
                if bl is None and L.startswith(b"Content-Length: "):
                    bl = int(L[16:].rstrip())
                if options.headers:
                    _output(L)
                continue
            # Blank line, HTTP header is finished
            if not options.body:
                return
            if options.headers:
                _output(L)
            if bl is not None and bl != wl:
                print("length mismatch: %s %s %s here: %s given: %s trunc: %s" %
                      (length, offset, filename, wl, bl, tr), file=sys.stderr)
            # HTTP body: no need to go on line by line, what is left of the
            # WARC content length can be output as a single block
            balance = clear_text.tell()
            _output(cb[balance:balance + wl])
            return
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
164 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
165 def main(): |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
166 global _output,TMPFILE,TMPFILENAME,tempfile |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
167 parser = argparse.ArgumentParser( |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
168 description='''Extract records from warc files given length, offset and file triples. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
169 Input one triple on command line, or |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
170 triples from stdin as tab-delimited lines |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
171 or complete cdx index lines. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
172 In all cases by 'filename' is meant crawlid/segmentid/type/filename''', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
173 epilog='''Note that if no output flag(s) is/are given, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
174 the whole WARC record will be output, more efficiently than |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
175 would be the case if all three flags were given.''', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
176 add_help=False, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
177 conflict_handler='resolve', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
178 formatter_class=HackFormat |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
179 ) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
180 fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
181 parser.add_argument('--help',help='Show help',action='help') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
182 parser.add_argument('-d','--debug',help='Debug output',action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
183 parser.add_argument('-w','--warc',help='output WARC headers', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
184 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
185 parser.add_argument('-h','--headers',help='output HTTP headers', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
186 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
187 parser.add_argument('-b','--body',help='output HTTP body', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
188 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
189 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
190 parser.add_argument('-m','--module.function',help='module.function to call with a stream'), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
191 parser.add_argument('-s','--save',action='store_true', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
192 help="write to a temporary file and output the name") |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
193 parser.add_argument('-f','--fpath', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
194 help=fphelp, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
195 default=FPAT) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
196 parser.add_argument('-r','--root',nargs='?', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
197 help='File path root, create a copy there if necessary', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
198 default='/beegfs/common_crawl'), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
199 parser.add_argument('-z','--zipped', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
200 help="output raw gzipped record, ignored if any of -bhw supplied", |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
201 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
202 sg=parser.add_mutually_exclusive_group() |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
203 sg.add_argument('-x','--index', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
204 help='take lines of triples from a cdx index file as input', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
205 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
206 sg.add_argument('length',type=int, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
207 help='length in bytes of gzipped record', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
208 nargs='?') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
209 parser.add_argument('offset',type=int, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
210 help='start position in bytes of gzipped record', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
211 nargs='?') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
212 parser.add_argument('filename', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
213 help='pathname of gzipped Common Crawl WARC-format file', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
214 nargs='?') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
215 # Hack the order of optional and positional in the help output |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
216 parser._action_groups.sort(key=lambda g:g.title) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
217 #parser.print_help() |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
218 pa=parser.parse_args(sys.argv[1:]) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
219 #print(pa,file=sys.stderr) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
220 if pa.length is not None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
221 # We have to enforce our own check.. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
222 if pa.offset is None or pa.filename is None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
223 parser.error("length, offset and filename must all be supplied together") |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
224 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
225 buf=bytearray(128*1024*1024) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
226 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
227 whole=not (pa.warc or pa.headers or pa.body) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
228 if pa.save: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
229 _output=_output_tmpfile |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
230 import tempfile |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
231 elif pa.cmd: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
232 _output = _output_subproc |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
233 else: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
234 _output = _output_stdout |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
235 if pa.index: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
236 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet... |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
237 for l in sys.stdin: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
238 m=CDX.search(l) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
239 if m is None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
240 if l.find('/robotstxt/')>-1: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
241 continue |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
242 print("index line problem: \"%s\""%l,file=sys.stderr,end='') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
243 exit(2) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
244 f=pa.fpath%(m[3:7]) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
245 try: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
246 process(pa,buf,f, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
247 int(m[2]),int(m[1]),whole) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
248 except Exception as e: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
249 if pa.debug: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
250 import traceback |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
251 traceback.print_exc(file=sys.stderr) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
252 else: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
253 print("Process fail: %s, input line:\n %s"%(e,l), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
254 file=sys.stderr,end='') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
255 exit(3) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
256 elif pa.length is not None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
257 print(pa.filename,file=sys.stderr) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
258 process(pa,buf,pa.fpath%tuple(pa.filename.split('/')), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
259 pa.offset,pa.length,whole) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
260 else: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
261 print("Reading length, offset, filename tab-delimited triples from stdin...", |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
262 file=sys.stderr) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
263 for l in sys.stdin: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
264 try: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
265 (length,offset,filename)=l.rstrip().split('\t') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
266 length=int(length) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
267 offset=int(offset) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
268 except ValueError as e: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
269 parser.error('Invalid input line: %s\n "%s"'%(e,l)) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
270 process(pa,buf,pa.fpath%tuple(filename.split('/')), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
271 offset,length,whole) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
272 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Script entry point: call main() only when this file is run directly, not when it is imported as a module.
273 if __name__ == "__main__": |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
274 main() |