Mercurial > hg > cc > cirrus_work
annotate bin/ix.py @ 109:52c6a9b0fc8c
loosen must-match criterion in the both-messy case
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 19 Sep 2023 19:29:41 +0100 |
parents | fa43c318749b |
children |
rev | line source |
---|---|
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
41 | 2 '''Extract response records from Common Crawl WARC-format files |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 given length, offset and filename triples. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 Input one triple on command line, or |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 triples from stdin as tab-delimited lines |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 or complete cdx index lines. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 In all cases by 'filename' is meant crawlid/segmentid/type/filename |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 import sys, argparse, regex, os, shutil, io, gzip, time, shlex |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 from isal import igzip |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 from subprocess import Popen, PIPE |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 #import asyncio |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Pattern for the usage fragment that HackFormat.format_help replaces.
# Written as a raw string so that \[ \] and \s reach the regex engine
# unchanged instead of being (invalid) string escapes; \n is still read
# as a newline by the regex engine.
HACK_USAGE=regex.compile(r'\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
# Binary output sink; launch() rebinds this to a subprocess's stdin
BINOUT=sys.stdout.buffer
# Default format string for turning 4 filename components into a path
FPAT="/%s/%s/orig/%s/%s"

# The Popen object for the current subprocess, set by launch()
CMD_PROC=None
# Name of the temporary file currently in use, set by process0()
TMPFILENAME=None
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
class HackFormat(argparse.RawDescriptionHelpFormatter):
    """Help formatter that patches the generated usage text.

    The fragment matched by HACK_USAGE is replaced by a spelling that
    shows -x and the length/offset/filename triple as alternatives.
    """

    def format_help(self):
        """Return the inherited help text with the usage fragment rewritten."""
        stock_help = super().format_help()
        replacement = '\n [ ( -x | length offset filename ) ]'
        return HACK_USAGE.sub(replacement, stock_help)
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def process(options,buf,filename,offset,length,whole):
    """Run process0 on one record, reporting any failure and exiting.

    All arguments are passed straight through to process0.  If process0
    raises, either a full traceback (when options.debug is true) or a
    one-line summary identifying the record is written to stderr, and
    the program then exits with status 3.
    """
    try:
        process0(options,buf,filename,offset,length,whole)
    except Exception as e:
        if options.debug:
            import traceback
            traceback.print_exc(file=sys.stderr)
        else:
            # Identify the record by its triple: the raw input line is
            # not available in this scope.
            print("Process fail: %s, input triple:\n %s\t%s\t%s"%(e,length,
                                                               offset,
                                                               filename),
                  file=sys.stderr)
        sys.exit(3)
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
40 |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
def process0(options,buf,filename,offset,length,whole):
    """Set up output routing for one record, extract it, then tidy up.

    Reads options.save, options.cmd and options.process:
     - with options.save, a temporary file is created before extraction
       and closed afterwards, and its name (plus a newline) is then
       written to the subprocess if options.cmd is set, else to BINOUT;
     - with options.cmd but not options.process, a fresh subprocess is
       launched before extraction and wound up afterwards;
     - a saved temporary file is deleted here only when options.cmd is
       set, otherwise a reminder naming it is printed to stderr.
    The extraction itself is done by process1, which receives all six
    arguments unchanged.
    """
    global TMPFILENAME, TMPFILE
    if options.save:
        # NOTE(review): tempfile is not imported at the top of this file;
        #  main() declares it global, so presumably it is imported there
        #  when needed -- confirm
        (tf,TMPFILENAME)=tempfile.mkstemp()
        TMPFILE=open(tf,mode='wb')
    if options.cmd and not options.process:
        launch(options.cmd)
    process1(options,buf,filename,offset,length,whole)
    if options.save:
        TMPFILE.close()
        if options.cmd:
            _output_subproc(bytes(TMPFILENAME,'utf-8'))
            _output_subproc(b"\n")
        else:
            BINOUT.write(bytes(TMPFILENAME,'utf-8'))
            BINOUT.write(b"\n")
    if options.cmd:
        if not options.process:
            # Argument order must match windup's (length,offset,filename)
            #  signature so that its failure message identifies the record
            windup(length,offset,filename)
        if options.save:
            os.unlink(TMPFILENAME)
            TMPFILENAME=None
    elif options.save:
        print("%s will need to be deleted"%TMPFILENAME,file=sys.stderr)
        TMPFILENAME=None
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
66 |
21
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
def launch(cmd):
    """Start cmd as a subprocess and make its stdin the binary output sink.

    cmd is split into an argument list with shlex.split.  The resulting
    Popen object is stored in CMD_PROC and its unbuffered stdin pipe
    replaces BINOUT.
    """
    global CMD_PROC, BINOUT
    argv = shlex.split(cmd)
    CMD_PROC = Popen(argv, stdin=PIPE, bufsize=0)
    BINOUT = CMD_PROC.stdin
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
71 |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
def windup(length,offset,filename):
    """Close the subprocess's input, wait for it, and report any failure.

    length, offset and filename are used only to identify the record in
    the message written to stderr when the exit status is non-zero.
    """
    # Wind up subproc
    BINOUT.close()
    status = CMD_PROC.wait()  # could/should be async?
    if status == 0:
        return
    print("subproc of %s:%s:%s failed with %s"%(length,offset,filename,
                                                 status),
          file=sys.stderr)
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
79 |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def _output_tmpfile(buf):
    """Write buf to the currently open temporary file (TMPFILE)."""
    sink = TMPFILE
    sink.write(buf)
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
82 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def _output_stdout(buf):
    """Write buf to the current binary output sink (BINOUT)."""
    sink = BINOUT
    sink.write(buf)
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
85 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def _output_subproc(buf):
    """Write all of buf to BINOUT, continuing after any short write.

    BINOUT.write may accept fewer bytes than it was given (launch opens
    the pipe with bufsize=0), so after each call only the part not yet
    written is offered again.
    """
    remaining = memoryview(buf)
    while len(remaining) > 0:
        written = BINOUT.write(remaining)
        # Drop what was accepted; re-sending the whole buffer would
        #  duplicate its leading bytes in the subprocess's input
        remaining = remaining[written:]
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
90 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def process1(options,buf,filename,offset,length,whole):
    """Read one gzipped WARC record and output the wanted parts of it.

    options  -- reads .root, .zipped, .warc, .headers and .body
    buf      -- writable buffer: the first length bytes receive the
                compressed chunk, the rest receives the decompressed
                record (it is also used as scratch space when copying a
                file into the local cache)
    filename -- path suffix appended to options.root
    offset, length -- position and size of the compressed chunk
    whole    -- if true, output the entire record (still compressed if
                options.zipped is also true) and do no parsing

    Output goes through the module-level _output function.
    If options.headers is a dict, the values of the HTTP headers named by
    its keys are stored into it and the dict is output in place of the
    header lines.

    Raises ValueError if fewer than length bytes could be read, or if
    HTTP headers or body are wanted but the WARC header has no
    Content-Length.
    """
    root=options.root
    rfn=root+filename
    if root!="/beegfs/common_crawl":
        # Support using ramdisk or other local disk as a faster cache:
        #  copy the file across the first time it is needed
        if not os.path.exists(rfn):
            os.makedirs(os.path.dirname(rfn),exist_ok=True)
            with io.FileIO('/beegfs/common_crawl'+filename,'r') as infile, \
                 io.FileIO(rfn,'w') as outfile:
                #shutil.copyfileobj(infile,outfile,128*1024*1024)
                while True:
                    l=infile.readinto(buf)
                    if l==0:
                        break
                    outfile.write(memoryview(buf)[:l])
    # with-statement so the file is closed even if seek/readinto raises
    with open(rfn,'rb',0) as chunk_file:
        chunk_file.seek(offset)
        bv=memoryview(buf)[:length]
        nb=chunk_file.readinto(bv)
    if nb!=length:
        raise ValueError("Chunk read losing: %s, got %s expected %s at %s"%(chunk_file.name,
                                                           nb,length,offset))
    if whole and options.zipped:
        _output(bv)
        return
    gzip_chunk = io.BytesIO(bv)
    uv=memoryview(buf)[length:]
    with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
        ll=0
        while True:
            # Append after what has already been read, so that a short
            #  read cannot be overwritten by the next one
            l=gzip_fin.readinto(uv[ll:])
            if not l:
                break
            ll+=l
        cb=memoryview(uv)[:ll]
        if whole:
            _output(cb)
            return
    # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
    state=0
    tr=None # Was this record truncated?
    bl=None # for HTTP Content-Length for the length of the body?
    wl=None # WARC Content-Length, less the HTTP header lines seen so far
    # Could we make this faster by working purely within the cb memoryview?
    # It would be messy, but avoid copying huge amounts
    # The outer loop would just be something like
    #   clbv=memoryview(bytearray(b"Content-Length: "))
    #   i=s=0
    #   while i<ll:
    #     if cb[i]==10: # need to handle \r\n
    #       L=cb[s:i]
    #       s=i=i+1
    #       if L[:16]==clbv:
    #         wl=int(L[16:])
    #     else:
    #       i+=1
    #
    with io.BytesIO(cb) as clear_text:
        for L in clear_text:
            if state==0:
                # WARC header
                if L.startswith(b"Content-Length: "):
                    wl=int(L[16:].rstrip())
                elif L.startswith(b"WARC-Truncated: "):
                    tr=L[16:].rstrip()
                    # tr is bytes, so compare with a bytes literal
                    tr="EMPTY" if tr==b"" else tr
                elif L==b"" or L.startswith(b"\r"): # for idempotency
                    # Blank line, WARC header is finished
                    if not (options.headers or options.body):
                        return
                    if wl is None:
                        raise ValueError("No WARC Content-Length for %s %s %s"%(length,
                                                                   offset,filename))
                    state=1
                # Note we preserve the empty line
                if options.warc:
                    _output(L)
                continue
            if state==1:
                # HTTP header
                wl -= len(L)
                if not (L==b"" or L.startswith(b"\r")):
                    # Non-blank, it's a header
                    (h,_,v)=L.partition(b": ")
                    if bl is None and (h==b"Content-Length"):
                        bl=int(v)
                    if options.headers:
                        if isinstance(options.headers,dict):
                            if h in options.headers:
                                options.headers[h]=v
                        else:
                            _output(L)
                else:
                    # Blank line, HTTP header is finished
                    if isinstance(options.headers,dict):
                        _output(bytes(str(options.headers),'utf-8'))
                    if not options.body:
                        return
                    if options.headers:
                        _output(L)
                    state=2
                    # The above is just for sanity, because we do _not_
                    #  continue with the outer loop,
                    #  since we can now block-output the entire rest of the
                    #  input buffer.
                    if bl is not None:
                        if bl!=wl:
                            print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
                                  (length,offset,filename,wl,bl,tr),file=sys.stderr)
                    # HTTP body
                    balance=clear_text.tell()
                    #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
                    # Output whatever is left
                    _output(cb[balance:balance+wl])
                    return
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
204 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
205 def main(): |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
206 global _output,TMPFILE,TMPFILENAME,tempfile |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
207 parser = argparse.ArgumentParser( |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
208 description='''Extract records from warc files given length, offset and file triples. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
209 Input one triple on command line, or |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
210 triples from stdin as tab-delimited lines |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
211 or complete cdx index lines. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
212 In all cases by 'filename' is meant crawlid/segmentid/type/filename''', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
213 epilog='''Note that if no output flag(s) is/are given, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
214 the whole WARC record will be output, more efficiently than |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
215 would be the case if all three flags were given.''', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
216 add_help=False, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
217 conflict_handler='resolve', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
218 formatter_class=HackFormat |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
219 ) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
220 fphelp=('format string for turning 4 filename components into a path, must contain %%s exactly 4 times,\ndefault is "%s"'%FPAT).replace('%s','%%s') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
221 parser.add_argument('--help',help='Show help',action='help') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
222 parser.add_argument('-d','--debug',help='Debug output',action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
223 parser.add_argument('-w','--warc',help='output WARC headers', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
224 action='store_true') |
21
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
225 parser.add_argument('-h','--headers',help='process HTTP headers: collect into dict with named values (,-separated) if arg present, else output', |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
226 nargs='?',default=None,const=True) |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
227 parser.add_argument('-b','--body',help='output HTTP body', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
228 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
229 parser.add_argument('-c','--cmd',help='pipes each result thru CMD') |
21
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
230 parser.add_argument('-p','--process',help='with -c, launches CMD only once', |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
231 action='store_true') |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
232 parser.add_argument('-m','--module.function',help='module.function to call with a stream'), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
233 parser.add_argument('-s','--save',action='store_true', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
234 help="write to a temporary file and output the name") |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
235 parser.add_argument('-f','--fpath', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
236 help=fphelp, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
237 default=FPAT) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
238 parser.add_argument('-r','--root',nargs='?', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
239 help='File path root, create a copy there if necessary', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
240 default='/beegfs/common_crawl'), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
241 parser.add_argument('-z','--zipped', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
242 help="output raw gzipped record, ignored if any of -bhw supplied", |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
243 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
244 sg=parser.add_mutually_exclusive_group() |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
245 sg.add_argument('-x','--index', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
246 help='take lines of triples from a cdx index file as input', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
247 action='store_true') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
248 sg.add_argument('length',type=int, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
249 help='length in bytes of gzipped record', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
250 nargs='?') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
251 parser.add_argument('offset',type=int, |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
252 help='start position in bytes of gzipped record', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
253 nargs='?') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
254 parser.add_argument('filename', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
255 help='pathname of gzipped Common Crawl WARC-format file', |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
256 nargs='?') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
257 # Hack the order of optional and positional in the help output |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
258 parser._action_groups.sort(key=lambda g:g.title) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
259 #parser.print_help() |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
260 pa=parser.parse_args(sys.argv[1:]) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
261 #print(pa,file=sys.stderr) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
262 if pa.length is not None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
263 # We have to enforce our own check.. |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
264 if pa.offset is None or pa.filename is None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
265 parser.error("length, offset and filename must all be supplied together") |
21
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
266 if isinstance(pa.headers,str): |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
267 pa.headers=dict((bytes(k,'utf-8'),None) for k in pa.headers.split(',')) |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
268 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
269 buf=bytearray(128*1024*1024) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
270 |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
271 whole=not (pa.warc or pa.headers or pa.body) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
272 if pa.save: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
273 _output=_output_tmpfile |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
274 import tempfile |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
275 elif pa.cmd: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
276 _output = _output_subproc |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
277 else: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
278 _output = _output_stdout |
21
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
279 if pa.cmd and pa.process: |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
280 launch(pa.cmd) |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
281 # three different ways to process |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
282 if pa.index: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
283 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet... |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
284 for l in sys.stdin: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
285 m=CDX.search(l) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
286 if m is None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
287 if l.find('/robotstxt/')>-1: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
288 continue |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
289 print("index line problem: \"%s\""%l,file=sys.stderr,end='') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
290 exit(2) |
21
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
291 filename=pa.fpath%(m[3:7]) |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
292 process(pa,buf,filename, |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
293 int(offset:=m[2]),int(length:=m[1]),whole) |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
294 elif pa.length is not None: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
295 print(pa.filename,file=sys.stderr) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
296 process(pa,buf,pa.fpath%tuple(pa.filename.split('/')), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
297 pa.offset,pa.length,whole) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
298 else: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
299 print("Reading length, offset, filename tab-delimited triples from stdin...", |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
300 file=sys.stderr) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
301 for l in sys.stdin: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
302 try: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
303 (length,offset,filename)=l.rstrip().split('\t') |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
304 length=int(length) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
305 offset=int(offset) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
306 except ValueError as e: |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
307 parser.error('Invalid input line: %s\n "%s"'%(e,l)) |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
308 process(pa,buf,pa.fpath%tuple(filename.split('/')), |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
309 offset,length,whole) |
21
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
310 # processing done one way or another |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
311 if pa.cmd and pa.process: |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
312 windup(length,offset,filename) |
cbac7dfe2f24
interpolate process0, support permanent subproc
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
18
diff
changeset
|
313 # if pa.save and pa.process, deleting temp files is down to cmd |
18
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
314 if __name__ == "__main__": |
046dbe557911
write to tmp file implemented
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
315 main() |