comparison bin/ix.py @ 123:5b0ec642ee9b

silently skip robotstxt
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 17:16:15 +0000
parents bc958b776fb8
children b51d65ed6c89
comparison
equal deleted inserted replaced
122:9de06ae73372 123:5b0ec642ee9b
13 from subprocess import Popen, PIPE 13 from subprocess import Popen, PIPE
14 #import asyncio 14 #import asyncio
15 15
16 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') 16 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
17 BINOUT=sys.stdout.buffer 17 BINOUT=sys.stdout.buffer
18 FPAT="/%s/%s/orig/warc/%s" 18 FPAT="/%s/%s/orig/%s/%s"
19 19
20 class HackFormat(argparse.RawDescriptionHelpFormatter): 20 class HackFormat(argparse.RawDescriptionHelpFormatter):
21 def format_help(self): 21 def format_help(self):
22 global FOO 22 global FOO
23 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) 23 FOO=argparse.RawDescriptionHelpFormatter.format_help(self)
198 if pa.cmd: 198 if pa.cmd:
199 _output = _output_subproc 199 _output = _output_subproc
200 else: 200 else:
201 _output = _output_stdout 201 _output = _output_stdout
202 if pa.index: 202 if pa.index:
203 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') 203 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
204 for l in sys.stdin: 204 for l in sys.stdin:
205 m=CDX.search(l) 205 m=CDX.search(l)
206 if m is None: 206 if m is None:
207 if l.find('/robotstxt/')>-1:
208 continue
207 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) 209 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
208 exit(2) 210 exit(2)
209 f=FPAT%(m[3:6]) 211 f=FPAT%(m[3:7])
210 process(pa,buf,f, 212 process(pa,buf,f,
211 int(m[2]),int(m[1]),whole) 213 int(m[2]),int(m[1]),whole)
212 elif pa.length is not None: 214 elif pa.length is not None:
213 print(pa.filename,file=sys.stderr) 215 print(pa.filename,file=sys.stderr)
214 process(pa,buf,FPAT%tuple(pa.filename.split('/')), 216 process(pa,buf,FPAT%tuple(pa.filename.split('/')),